diff --git a/NEWS.md b/NEWS.md index 28b5856a24..863107aa8c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,11 +4,13 @@ See also [github's page](https://github.com/FluxML/Flux.jl/releases) for a compl ## v0.15.0 * Recurrent layers have undergone a complete redesign in [PR 2500](https://github.com/FluxML/Flux.jl/pull/2500). -* `RNN`, `LSTM`, and `GRU` no longer store the hidden state internally. Instead, they now take the previous state as input and return the updated state as output. -* These layers (`RNN`, `LSTM`, `GRU`) now process entire sequences at once, rather than one element at a time. -* The `Recur` wrapper has been deprecated and removed. -* The `reset!` function has also been removed; state management is now entirely up to the user. -* `RNNCell`, `LSTMCell`, and `GRUCell` are now exported and provide functionality for single time-step processing. + * `RNNCell`, `LSTMCell`, and `GRUCell` are now exported and provide functionality for single time-step processing: `rnncell(x_t, h_t) -> h_{t+1}`. + * `RNN`, `LSTM`, and `GRU` no longer store the hidden state internally; it has to be explicitly passed to the layer. Moreover, they now process entire sequences at once, rather than one element at a time: `rnn(x, h) -> h′`. + * The `Recur` wrapper has been deprecated and removed. + * The `reset!` function has also been removed; state management is now entirely up to the user. +* The `Flux.Optimise` module has been deprecated in favor of the Optimisers.jl package. + Now Flux re-exports the optimisers from Optimisers.jl. Most users will be unaffected by this change. + The module is still available for now, but will be removed in a future release. ## v0.14.22 * Data movement between devices is now provided by [MLDataDevices.jl](https://github.com/LuxDL/MLDataDevices.jl). diff --git a/docs/src/guide/models/overview.md b/docs/src/guide/models/overview.md index 71eff0d33f..8bb88833c8 100644 --- a/docs/src/guide/models/overview.md +++ b/docs/src/guide/models/overview.md @@ -95,7 +95,7 @@ Under the hood, the Flux [`Flux.train!`](@ref) function uses *a loss function* a julia> using Flux: train!
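To make the recurrent-layer bullets above concrete, here is a minimal sketch of the explicit-state API they describe; the feature sizes, batch size, and zero initial state are illustrative, and the exact output layout follows the layer docstrings rather than this example.

```julia
using Flux

# One time step with a cell: the state is an explicit argument and return value.
cell = RNNCell(3 => 5)
x_t = rand(Float32, 3, 16)      # features × batch for a single step
h_t = zeros(Float32, 5, 16)     # the user now owns the state (no Recur, no reset!)
h_next = cell(x_t, h_t)         # rnncell(x_t, h_t) -> h_{t+1}

# A whole sequence with the layer: input is features × time × batch.
rnn = RNN(3 => 5)
x = rand(Float32, 3, 10, 16)
h = rnn(x, h_t)                 # rnn(x, h) -> h′, processing all steps at once
```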
julia> opt = Descent() -Descent(0.1) +Descent(0.1f0) julia> data = [(x_train, y_train)] 1-element Vector{Tuple{Matrix{Int64}, Matrix{Int64}}}: diff --git a/src/Flux.jl b/src/Flux.jl index 5dd193ef2b..2804803947 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -12,6 +12,8 @@ using MLUtils const stack = MLUtils.stack # now exported by Base import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owned these functions using Optimisers: freeze!, thaw!, adjust!, trainables +@reexport using Optimisers + using Random: default_rng using Zygote, ChainRulesCore using Zygote: Params, @adjoint, gradient, pullback @@ -56,13 +58,8 @@ export Chain, Dense, Embedding, EmbeddingBag, )) include("optimise/Optimise.jl") -using .Optimise -export Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, - AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, - WeightDecay, SignDecay, ClipValue, ClipNorm - -export ClipGrad, OptimiserChain # these are const defined in deprecations, for ClipValue, Optimiser +using .Optimise: Optimise +export ClipValue # this is const defined in deprecations, for ClipGrad include("train.jl") using .Train diff --git a/src/deprecations.jl b/src/deprecations.jl index 0d842f786b..57ea3bf72a 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -41,31 +41,40 @@ train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error """) train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) = - train!(loss, model, data, _old_to_new(opt); cb) + train!(loss, model, data, __old_to_new(opt); cb) # Next, to use the new `setup` with the still-exported old-style `Adam` etc: import .Train: setup -setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model) +setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model) # ... and allow accidental use of `Optimisers.setup` to do the same: -Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(_old_to_new(rule), model) +Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model) + + +function __old_to_new(rule) + Base.depwarn("""Optimisers from Flux.Optimise module are deprecated. + Use optimisers from Optimisers.jl instead.""", :__old_to_new) + return _old_to_new(rule) +end for T in [:Descent, :Adam, :Momentum, :Nesterov, :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief, # :InvDecay, :ExpDecay, :SignDecay, ] - @eval function _old_to_new(rule::$T) + @eval function _old_to_new(rule::Optimise.$T) args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T)) Optimisers.$T(args...) end end -_old_to_new(rule::Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...) -const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too. -_old_to_new(rule::WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now -_old_to_new(rule::ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields -_old_to_new(rule::ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs -const ClipGrad = Optimise.ClipValue -_old_to_new(rule::RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred +_old_to_new(rule::Optimise.Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...) +# const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too. 
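In practice, the deprecation shims above funnel code from the implicit `Params`/`Flux.Optimise` style towards the Optimisers.jl rules that Flux now re-exports. A rough sketch of the intended usage (the model, data shapes, and learning rates are made up for illustration):

```julia
using Flux

model = Dense(4 => 2)
x, y = rand(Float32, 4, 8), rand(Float32, 2, 8)

# New explicit style: rules come from Optimisers.jl and state is set up once.
rule = OptimiserChain(WeightDecay(1f-4), Adam(1f-3))   # replaces Optimise.Optimiser(...)
opt_state = Flux.setup(rule, model)

g = gradient(m -> Flux.mse(m(x), y), model)[1]
Flux.update!(opt_state, model, g)

# Passing an old-style rule, e.g. Flux.Optimise.Adam(1f-3), to Flux.setup still
# works for now, but goes through __old_to_new and emits a deprecation warning.
```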
+const Optimiser = Optimisers.OptimiserChain +_old_to_new(rule::Optimise.WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now +_old_to_new(rule::Optimise.ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields +_old_to_new(rule::Optimise.ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs +# const ClipGrad = Optimise.ClipValue +const ClipValue = Optimisers.ClipGrad +_old_to_new(rule::Optimise.RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred _old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule") @@ -83,8 +92,21 @@ function update!(opt::Optimise.AbstractOptimiser, model, grad) # to accept only arrays. Remove if this causes problems! # update!(opt::Flux.Optimise.AbstractOptimiser, x::AbstractArray, x̄) error("""Invalid input to `update!`. - * For the implicit style, this needs `update(::AbstractOptimiser, ::Params, ::Grads)` - * For the explicit style, `update(state, model, grad)` needs `state = Flux.setup(opt, model)`. + * For the implicit style, this needs `update!(::AbstractOptimiser, ::Params, ::Grads)` + * For the explicit style, `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. + """) +end + +# TODO this friendly error should go in Optimisers.jl. +# remove after https://github.com/FluxML/Optimisers.jl/pull/181 +function update!(opt::Optimisers.AbstractRule, model, grad) + error("""Invalid input to `update!`. + `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. + """) +end +function update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple) + error("""Invalid input to `update!`. + `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. """) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 8ba07b95a8..14ed11e319 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -145,7 +145,7 @@ Conv((3,), 4 => 5, σ) # 65 parameters julia> layer(randn(100, 4, 64)) |> size (98, 5, 64) -julia> Flux.params(layer) |> length +julia> Flux.trainables(layer) |> length 2 ``` """ @@ -294,7 +294,7 @@ ConvTranspose((3,), 5 => 4, σ) # 64 parameters julia> layer(randn(100, 5, 64)) |> size # transposed convolution will increase the dimension size (upsampling) (102, 4, 64) -julia> Flux.params(layer) |> length +julia> Flux.trainables(layer) |> length 2 ``` """ diff --git a/src/layers/show.jl b/src/layers/show.jl index 67cf49e996..f3fc170ec5 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -104,15 +104,15 @@ function _layer_show(io::IO, layer, indent::Int=0, name=nothing) _str = isnothing(name) ? "" : "$name = " str = _str * _layer_string(io, layer) print(io, " "^indent, str, indent==0 ? "" : ",") - if !isempty(params(layer)) + if !isempty(trainables(layer)) print(io, " "^max(2, (indent==0 ? 20 : 39) - indent - length(str))) - printstyled(io, "# ", underscorise(sum(length, params(layer); init=0)), " parameters"; + printstyled(io, "# ", underscorise(sum(length, trainables(layer); init=0)), " parameters"; color=:light_black) - nonparam = _childarray_sum(length, layer) - sum(length, params(layer), init=0) + nonparam = _childarray_sum(length, layer) - sum(length, trainables(layer), init=0) if nonparam > 0 printstyled(io, ", plus ", underscorise(nonparam), indent==0 ? 
" non-trainable" : ""; color=:light_black) end - _nan_show(io, params(layer)) + _nan_show(io, trainables(layer)) end indent==0 || println(io) end @@ -127,7 +127,7 @@ function _layer_string(::IO, a::AbstractArray) end function _big_finale(io::IO, m) - ps = params(m) + ps = trainables(m) if length(ps) > 2 pars = underscorise(sum(length, ps; init=0)) bytes = Base.format_bytes(Base.summarysize(m)) diff --git a/src/outputsize.jl b/src/outputsize.jl index 5d6132d059..c413405048 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -302,8 +302,6 @@ function ChainRulesCore.rrule(::typeof(striplazy), m) striplazy(m), _ -> error("striplazy should never be used within a gradient") end -params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). Call striplazy(m) first.") - Functors.functor(::Type{<:LazyLayer}, x) = error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.") function Base.show(io::IO, l::LazyLayer) diff --git a/test/data.jl b/test/data.jl index b97c4dae80..77ba99d133 100644 --- a/test/data.jl +++ b/test/data.jl @@ -80,18 +80,20 @@ using Random # test interaction with `train!` θ = ones(2) X = zeros(2, 10) - loss(x) = sum((x .- θ).^2) + loss(θ, x) = sum((x .- θ).^2) d = DataLoader(X) - Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1)) + opt_state = Flux.setup(Descent(0.1), θ) + Flux.train!(loss, θ, ncycle(d, 10), opt_state) @test norm(θ) < 1e-4 # test interaction with `train!` θ = zeros(2) X = ones(2, 10) Y = fill(2, 10) - loss(x, y) = sum((y - x'*θ).^2) + loss(θ, x, y) = sum((y - x'*θ).^2) d = DataLoader((X, Y)) - Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1)) + opt_state = Flux.setup(Descent(0.1), θ) + Flux.train!(loss, θ, ncycle(d, 10), opt_state) @test norm(θ .- 1) < 1e-10 # specify the rng diff --git a/test/ext_cuda/cuda.jl b/test/ext_cuda/cuda.jl index e4be91cd02..066998f14c 100644 --- a/test/ext_cuda/cuda.jl +++ b/test/ext_cuda/cuda.jl @@ -21,7 +21,7 @@ CUDA.allowscalar(false) m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) cm = gpu(m) - @test all(p isa CuArray for p in Flux.params(cm)) + @test all(p isa CuArray for p in Flux.trainables(cm)) @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} xs = rand(5, 5) diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl index 7b7ceb7114..cdb8f003e9 100644 --- a/test/ext_cuda/layers.jl +++ b/test/ext_cuda/layers.jl @@ -110,8 +110,8 @@ end l = cl((2,2), 1=>3, bias = false) |> gpu ip = zeros(Float32, 28,28,1,1) |> gpu @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params + gs = gradient(l -> sum(l(ip)), l)[1] + @test gs.bias === nothing end @testset "Dense without bias" begin @@ -119,8 +119,8 @@ end ip = zeros(Float32, 3, 7) |> gpu @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params + gs = gradient(l -> sum(l(ip)), l)[1] + @test gs.bias === nothing end @testset "Extended BatchNorm" begin @@ -133,13 +133,13 @@ end μ_cpu = copy(m_cpu.μ) m_cpu(x_cpu) @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu) @test !(m_cpu.μ ≈ μ_cpu) μ_gpu = copy(m_gpu.μ) m_gpu(x_gpu) @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu) @test !(m_gpu.μ ≈ μ_gpu) @test Array(m_gpu.μ) ≈ m_cpu.μ @@ -149,14 +149,14 @@ end μ_cpu = copy(m_cpu.μ) m_cpu(x_cpu) @test m_cpu.μ ≈ μ_cpu - gradient(() -> 
sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu) @test m_cpu.μ ≈ μ_cpu testmode!(m_gpu) μ_gpu = copy(m_gpu.μ) m_gpu(x_gpu) @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu) @test m_gpu.μ ≈ μ_gpu ## In trainmode, always track statistics @@ -165,7 +165,7 @@ end m_cpu(x_cpu) @test !(m_cpu.μ ≈ μ_cpu) μ_cpu = copy(m_cpu.μ) - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + gradient(m_cpu -> sum(m_cpu(x_cpu)), m_cpu) @test !(m_cpu.μ ≈ μ_cpu) trainmode!(m_gpu) @@ -173,44 +173,28 @@ end m_gpu(x_gpu) @test !(m_gpu.μ ≈ μ_gpu) μ_gpu = copy(m_gpu.μ) - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + gradient(m_gpu -> sum(m_gpu(x_gpu)), m_gpu) @test !(m_gpu.μ ≈ μ_gpu) - - ## No errors if input type mistmatch - # x_cpu = rand(Float64, 3, 2, 2) - # x_gpu = x_cpu |> gpu - # m_cpu(x_cpu) - # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - # m_gpu(x_gpu) - # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) end @testset "Two-streams Bilinear" begin x = zeros(Float32,10,9) |> gpu y = zeros(Float32,2,9) |> gpu b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end + @test size(b(x, y)) == (3,9) + @test sum(abs2, b(x, y)) ≈ 0f0 + test_gradients(b |> cpu, x |> cpu, y |> cpu, + test_gpu=true, compare_finite_diff=false, loss=(m, x, y) -> mean(abs2, m(x, y))) end @testset "Two-streams Bilinear" begin x = zeros(Float32,10,9) |> gpu y = zeros(Float32,2,9) |> gpu b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end + @test size(b(x, y)) == (3,9) + @test sum(abs2, b(x, y)) ≈ 0f0 + test_gradients(b |> cpu, x |> cpu, y |> cpu, + test_gpu=true, compare_finite_diff=false, loss=(m, x, y) -> mean(abs2, m(x, y))) end @testset "Parallel" begin @@ -228,15 +212,9 @@ end end @testset "gradient" begin - input_cpu = randn(10, 10, 10, 10) - input_gpu = input_cpu |> gpu layer_cpu = Parallel(+, x -> zero(x), identity) - layer_gpu = layer_cpu |> gpu - gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) - gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) - for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) - @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] - end + test_gradients(layer_cpu, randn(2, 2, 2, 2), + test_gpu=true, compare_finite_diff=false, loss=(m, x) -> mean(abs2, m(x))) end end diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 8e33340611..c95c8c8288 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -42,11 +42,11 @@ using Flux: activations @testset "Activations" begin c = Chain(Dense(3,5,relu), Dense(5,1,relu)) X = Float32.([1.0; 1.0; 1.0]) - @test_nowarn gradient(()->Flux.activations(c, X)[2][1], Flux.params(c)) + @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c) c2 = Chain(enc = c[1], dec = c[2]) @test Flux.activations(c, X) == Flux.activations(c2, X) - 
@test_nowarn gradient(()->Flux.activations(c2, X)[2][1], Flux.params(c2)) + @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c2) end @testset "Dense" begin @@ -158,9 +158,9 @@ using Flux: activations @test mo(input) == target end - @testset "params" begin + @testset "trainables" begin mo = Maxout(()->Dense(32, 64), 4) - ps = Flux.params(mo) + ps = Flux.trainables(mo) @test length(ps) == 8 #4 alts, each with weight and bias end end @@ -198,7 +198,7 @@ using Flux: activations x = randn(Float32,11,7) b = Flux.Bilinear(11, 11, 3) @test size(b(x)) == (3,7) - @test_nowarn gs = gradient(() -> sum(abs2.(b(x))), params(b)) + test_gradients(b, x) end @testset "constructors" begin @@ -447,16 +447,15 @@ end @testset "gradients of Chain{Vector}" begin m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) m1v = Chain([m1[1], m1[2]]) - @test sum(length, params(m1)) == sum(length, params(m1v)) + @test sum(length, Flux.trainables(m1)) == sum(length, Flux.trainables(m1v)) x1 = randn(Float32,3,5) @test m1(x1) ≈ m1v(x1) y1 = rand(Bool,2,5) - g1 = gradient(() -> Flux.Losses.logitcrossentropy(m1(x1), y1), params(m1)) - g1v = gradient(() -> Flux.Losses.logitcrossentropy(m1v(x1), y1), params(m1v)) - @test g1[m1[1].weight] ≈ g1v[m1v[1].weight] - @test g1[m1[2].bias] ≈ g1v[m1v[2].bias] + g1 = gradient(m1 -> Flux.logitcrossentropy(m1(x1), y1), m1)[1] + g1v = gradient(m1v -> Flux.logitcrossentropy(m1v(x1), y1), m1v)[1] + check_equal_leaves(g1, g1v) @test Flux.destructure(m1)[1] ≈ Flux.destructure(m1v)[1] z1 = rand(22); diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 2e75a1e39d..8780fef957 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -43,28 +43,30 @@ end @test sum(op) == prod(size(op)) @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32) - bias = Conv((2,2), 1=>3, bias = false) |> lmap - op = bias(ip) + model = Conv((2,2), 1=>3, bias = false) |> lmap + op = model(ip) @test sum(op) ≈ 0.f0 - gs = gradient(() -> sum(bias(ip)), Flux.params(bias)) - @test bias.bias ∉ gs.params + g = gradient(m -> sum(m(ip)), model)[1] + @test g.bias isa Nothing end - # Train w/o bias and make sure no convergence happens - # when only bias can be converged - bias = Conv((2, 2), 1=>3, bias = false); - ip = zeros(Float32, 28,28,1,1) - op = zeros(Float32, 27,27,3,1) .+ 2.f0 - opt = Descent() - - for _ = 1:10^3 - gs = gradient(Flux.params(bias)) do - Flux.Losses.mse(bias(ip), op) + @testset "no bias train" begin + # Train w/o bias and make sure no convergence happens + # when only bias can be converged + model = Conv((2, 2), 1=>3, bias = false); + ip = zeros(Float32, 28,28,1,1) + op = zeros(Float32, 27,27,3,1) .+ 2.f0 + opt_state = Flux.setup(Descent(), model) + + for _ = 1:10^3 + g = gradient(model) do m + Flux.mse(m(ip), op) + end[1] + Flux.update!(opt_state, model, g) end - Flux.Optimise.update!(opt, params(bias), gs) - end - @test Flux.Losses.mse(bias(ip), op) ≈ 4.f0 + @test Flux.Losses.mse(model(ip), op) ≈ 4.f0 + end @testset "Grouped Conv" begin ip = rand(Float32, 28, 100, 2) @@ -164,11 +166,11 @@ end m = ConvTranspose((3,3), 1=>1) # Test that the gradient call does not throw: #900 - @test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m)[1] x = zeros(Float32, 5, 5, 2, 4) m = ConvTranspose((3,3), 2=>3) - @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m)[1] # test ConvTranspose supports groups argument x = randn(Float32, 10, 10, 2, 3) @@ -178,7 +180,7 @@ end m2 = ConvTranspose((3,3), 2=>4, 
groups=2, pad=SamePad()) @test size(m2.weight) == (3,3,2,2) @test size(m1(x)) == size(m2(x)) - @test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads + g = gradient(m -> sum(m(x)), m2)[1] x = randn(Float32, 10, 2,1) m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index be7c5dec92..f678297eaa 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -129,7 +129,7 @@ end 2.0 4.0 6.0] @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) @@ -211,9 +211,9 @@ end @inferred m(x) end - @test length(Flux.params(BatchNorm(10))) == 2 - @test length(Flux.params(BatchNorm(10, affine=true))) == 2 - @test length(Flux.params(BatchNorm(10, affine=false))) == 0 + @test length(Flux.trainables(BatchNorm(10))) == 2 + @test length(Flux.trainables(BatchNorm(10, affine=true))) == 2 + @test length(Flux.trainables(BatchNorm(10, affine=false))) == 0 @test BatchNorm(5; active=true).active === true @test_throws Exception BatchNorm(5; active=:something_else) @@ -224,7 +224,7 @@ end let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float32.(x) @test m.β == [0, 0] # initβ(2) @test m.γ == [1, 1] # initγ(2) @@ -287,7 +287,7 @@ end x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float64.(x) y = m(x) μ = mean(x, dims=1) @@ -300,7 +300,7 @@ end let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + @test length(Flux.trainables(m)) == 0 x = Float64.(x) y = m(x) @@ -345,9 +345,9 @@ end @inferred m(x) end - @test length(Flux.params(InstanceNorm(10))) == 0 - @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 - @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 + @test length(Flux.trainables(InstanceNorm(10))) == 0 + @test length(Flux.trainables(InstanceNorm(10, affine=true))) == 2 + @test length(Flux.trainables(InstanceNorm(10, affine=false))) == 0 @test InstanceNorm(5; active=true).active === true @test_throws Exception InstanceNorm(5; active=:something_else) @@ -370,10 +370,10 @@ end m = LayerNorm((2,3,4)) @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 m = LayerNorm((2,3,4), affine=false) @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + @test length(Flux.trainables(m)) == 0 end @testset "GroupNorm" begin @@ -383,7 +383,7 @@ end let m = GroupNorm(4,2), sizes = (3,4,2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 + @test length(Flux.trainables(m)) == 2 x = Float32.(x) @test m.β == [0, 0, 0, 0] # initβ(32) @test m.γ == [1, 1, 1, 1] # initγ(32) diff --git a/test/optimise.jl b/test/optimise.jl deleted file mode 100644 index c63ba85727..0000000000 --- a/test/optimise.jl +++ /dev/null @@ -1,222 +0,0 @@ -using Flux.Optimise -using Flux.Optimise: runall -using Flux: Params, gradient -import FillArrays, ComponentArrays -import Optimisers -using Test -using Random - -@testset "Optimise" begin - # Ensure rng has different state inside and outside the inner @testset - # so that w and w' are different - 
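The `test/optimise.jl` file removed here exercised the implicit `Params`/`Optimise.update!` interface. Written against the explicit API that the rest of this PR adopts, the same kind of convergence check looks roughly like this (a sketch only, not part of the test suite):

```julia
using Flux, Random

Random.seed!(84)
w  = randn(10, 10)                      # fixed target map
w′ = randn(10, 10)                      # array being optimised
loss(w′, x) = Flux.mse(w * x, w′ * x)

opt_state = Flux.setup(Adam(), w′)      # explicit state instead of Params([w′])
for _ in 1:10^5
    x = rand(10)
    g = gradient(w′ -> loss(w′, x), w′)[1]
    Flux.update!(opt_state, w′, g)      # mutates w′ in place
end
loss(w′, rand(10, 10)) < 0.01           # the bound the old test asserted
```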
Random.seed!(84) - w = randn(10, 10) - @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), - NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), - Nesterov(), RMSProp(), Momentum()] - Random.seed!(42) - w′ = randn(10, 10) - b = false - loss(x) = Flux.Losses.mse(w*x, w′*x .+ b) - for t = 1: 10^5 - θ = params([w′, b]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) - end - @test loss(rand(10, 10)) < 0.01 - end -end - -@testset "Optimiser" begin - Random.seed!(84) - w = randn(10, 10) - @testset for Opt in [InvDecay, WeightDecay, ExpDecay, SignDecay] - Random.seed!(42) - w′ = randn(10, 10) - loss(x) = Flux.Losses.mse(w*x, w′*x) - opt = Optimiser(Opt(), Adam(0.001)) - for t = 1:10^5 - θ = Params([w′]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) - end - @test loss(rand(10, 10)) < 0.01 - end -end - -@testset "Training Loop" begin - - # Test multiple callbacks - x = 0 - fs = [() -> (), () -> x = 1] - cbs = runall(fs) - cbs() - @test x == 1 - - r = rand(3, 3) - loss(x) = sum(x .* x) - Flux.train!(loss, Flux.params(r), (r,), Descent()) -end - -@testset "Stop on NaN" begin - m = Dense(1 => 1) - m.weight .= 0 - CNT = 0 - @test_throws DomainError Flux.train!(Flux.params(m), 1:100, Descent(0.1)) do i - CNT += 1 - (i == 51 ? NaN32 : 1f0) * sum(m([1.0])) - end - @test CNT == 51 # stopped early - @test m.weight[1] ≈ -5 # did not corrupt weights -end - -@testset "ExpDecay" begin - - @testset "Sanity Check" begin - o = ExpDecay(0.2, 0.5, 1, 1e-3) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - @testset "starting step" begin - start = 4 - o = ExpDecay(0.2, 0.5, 1, 1e-3, start) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ max(steps - start, 0), o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - w = randn(10, 10) - o = ExpDecay(0.1, 0.1, 1000, 1e-4) - w1 = randn(10,10) - loss(x) = Flux.Losses.mse(w*x, w1*x) - flag = 1 - decay_steps = [] - for t = 1:10^5 - prev_eta = o.eta - θ = Params([w1]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - prev_grad = collect(θ̄[w1]) - delta = Optimise.apply!(o, w1, θ̄[w1]) - w1 .-= delta - new_eta = o.eta - if new_eta != prev_eta - push!(decay_steps, t) - end - array = fill(o.eta, size(prev_grad)) - if array .* prev_grad != delta - flag = 0 - end - end - @test flag == 1 - # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). - ground_truth = [] - for i in 1:4 - push!(ground_truth, 1000*i) # Expected decay steps for this example. 
- end - @test decay_steps == ground_truth - @test o.eta == o.clip -end - -@testset "Clipping" begin - w = randn(10, 10) - loss(x) = sum(w * x) - θ = Params([w]) - x = 1000 * randn(10) - w̄ = gradient(() -> loss(x), θ)[w] - w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) - @test all(w̄_value .<= 1) - w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) - @test norm(w̄_norm) <= 1 -end - -@testset "update!: handle Fills from Zygote" begin - w = randn(10,10) - wold = copy(w) - g = FillArrays.Ones(size(w)) - opt = Descent(0.1) - Flux.update!(opt, w, g) - @test w ≈ wold .- 0.1 - - w = randn(3) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> w[1], θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w[1] ≈ wold[1] .- 0.1 - @test w[2:3] ≈ wold[2:3] - - ## Issue #1510 - w = randn(10,10) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 -end - -@testset "update!: handle ComponentArrays" begin - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - opt_state = Optimisers.setup(Optimisers.Descent(0.1), w) - gs = gradient(w -> w.a + sum(w.c.b), w)[1] - Flux.update!(opt_state, w, gs) - @test w.a ≈ wold.a - 0.1 - @test w.b ≈ wold.b - @test w.c.b ≈ wold.c.b .- 0.1 - @test w.c.a ≈ wold.c.a - - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - opt_state = Optimisers.setup(Optimisers.Descent(0.1), w) - gs = gradient(w -> sum(w), w)[1] - Flux.update!(opt_state, w, gs) - @test w ≈ wold .- 0.1 -end - -# Flux PR #1776 -# We need to test that optimisers like Adam that maintain an internal momentum -# estimate properly calculate the second-order statistics on the gradients as -# the flow backward through the model. Previously, we would calculate second- -# order statistics via `Δ^2` rather than the complex-aware `Δ * conj(Δ)`, which -# wreaks all sorts of havoc on our training loops. 
This test ensures that -# a simple optimization is montonically decreasing (up to learning step effects) -@testset "Momentum Optimisers and complex values" begin - # Test every optimiser that has momentum internally - for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] - # Our "model" is just a complex number - w = zeros(ComplexF32, 1) - - # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` - function loss() - # Deterministic training data is the best training data - x = ones(1, 1) + 1im*ones(1, 1) - - # Manually implement `mse()` to allow demonstration of brokenness - # on older Flux builds that don't have a fixed `mse()` - return sum(abs2.(w * x .- conj(x))) - end - - params = Flux.Params([w]) - opt = opt_ctor(1e-2) - - # Train for 10 iterations, enforcing that loss is monotonically decreasing - last_loss = Inf - for idx in 1:10 - grads = Flux.gradient(loss, params) - @test loss() < last_loss - last_loss = loss() - Flux.update!(opt, params, grads) - end - end -end diff --git a/test/outputsize.jl b/test/outputsize.jl index 55cb823c5c..fe217c0fc9 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -248,7 +248,7 @@ end @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" @test Flux.striplazy(ld) isa Dense - @test_throws Exception Flux.params(lm) + @test_throws Exception Flux.trainables(lm) @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1,2]) @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1,2])), ld) diff --git a/test/runtests.jl b/test/runtests.jl index ff6660be14..6f5a2e7d84 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,5 @@ using Flux using Flux: OneHotArray, OneHotMatrix, OneHotVector -using Flux: params using Test using Random, Statistics, LinearAlgebra using IterTools: ncycle @@ -32,8 +31,7 @@ Random.seed!(0) include("loading.jl") end - @testset "Optimise / Train" begin - include("optimise.jl") + @testset "Train" begin include("train.jl") include("tracker.jl") end diff --git a/test/test_utils.jl b/test/test_utils.jl index 25a4f1af47..c736943f1c 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -19,7 +19,6 @@ function finitediff_withgradient(f, x...) return y, FiniteDifferences.grad(fdm, f, x...) end - function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4) fmapstructure_with_path(a, b) do kp, x, y if x isa AbstractArray @@ -30,7 +29,6 @@ function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4) end end - function test_gradients( f, xs...; diff --git a/test/train.jl b/test/train.jl index 5a1fd0592e..a021b6f22a 100644 --- a/test/train.jl +++ b/test/train.jl @@ -90,38 +90,28 @@ end @testset "Explicit Flux.update! 
features" begin m = Chain(Dense(2=>3, tanh), Dense(3=>1), only) - x = rand(2) + x = rand(Float32, 2) y1 = m(x) # before - # Implicit gradient - gold = Zygote.gradient(() -> m(x), Flux.params(m)) - @test gold isa Flux.Zygote.Grads - @test_throws ErrorException Flux.update!(Flux.Adam(), m, gold) # friendly - Flux.update!(Flux.Adam(), Flux.params(m), gold) - y2 = m(x) - @test y2 < y1 - # Explicit gradient gs = Zygote.gradient(marg -> marg(x), m) @test gs isa Tuple - @test_throws ErrorException Flux.update!(Flux.Adam(), Flux.params(m), gs) # friendly - @test_throws ErrorException Flux.update!(Flux.Adam(), Flux.params(m), gs[1]) # friendly @test_throws ErrorException Flux.update!(Flux.Adam(), m, gs) # friendly @test_throws ErrorException Flux.update!(Flux.Adam(), m, gs[1]) # friendly s = Flux.setup(Adam(), m) @info "ignore this warning, just testing an upgrade path:" Flux.update!(s, m, gs) # Chain + Tuple can be unambiguously sorted out + y2 = m(x) + @test y2 < y1 + Flux.update!(s, m, gs[1]) # finally, this is the correct thing y3 = m(x) @test y3 < y2 - Flux.update!(s, m, gs[1]) # finally, this is the correct thing - y4 = m(x) - @test y4 < y3 # Also check that if you import the new Adam, then Flux.setup does still work! s2 = Flux.setup(Optimisers.Adam(), m) Flux.update!(s2, m, gs[1]) - y5 = m(x) - @test y5 < y4 + y4 = m(x) + @test y4 < y3 end for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) @@ -147,25 +137,21 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) end diff1 = model.weight .- init_weight - # Take 2: the same, but with Flux.params. Was broken for a bit, no tests! - # skipping this test for Enzyme cause implicit params is unsupported - if name == "Zygote" - model.weight .= init_weight - model.bias .= 0 - pen2(x::AbstractArray) = sum(abs2, x)/2 - opt = Flux.setup(Adam(0.1), model) - - trainfn!(model, data, opt) do m, x, y - err = Flux.mse(m(x), y) - l2 = sum(pen2, Flux.params(m)) - err + 0.33 * l2 - end - - diff2 = model.weight .- init_weight - @test diff1 ≈ diff2 + # Take 2: the same, but with Optimisers.trainables. + model.weight .= init_weight + model.bias .= 0 + pen2(x::AbstractArray) = sum(abs2, x)/2 + opt = Flux.setup(Adam(0.1), model) + trainfn!(model, data, opt) do m, x, y + err = Flux.mse(m(x), y) + l2 = sum(pen2, Flux.trainables(m)) + err + 0.33 * l2 end + diff2 = model.weight .- init_weight + @test diff1 ≈ diff2 + # Take 3: using WeightDecay instead. Need the /2 above, to match exactly. model.weight .= init_weight model.bias .= 0 diff --git a/test/utils.jl b/test/utils.jl index dc46d68255..6b0a16bcf3 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -2,7 +2,7 @@ using Flux using Flux: throttle, nfan, glorot_uniform, glorot_normal, kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, lecun_normal, sparse_init, identity_init, unstack, batch, unbatch, - unsqueeze, params, loadmodel! + unsqueeze, loadmodel! using MLUtils using Statistics, LinearAlgebra using Random @@ -255,38 +255,32 @@ end end end -@testset "Params" begin +@testset "Trainables" begin m = Dense(10 => 5) - @test size.(params(m)) == [(5, 10), (5,)] + @test size.(Flux.trainables(m)) == [(5, 10), (5,)] m = RNN(10 => 5) - @test size.(params(m)) == [(5, 10), (5, 5), (5,)] + @test size.(Flux.trainables(m)) == [(5, 10), (5, 5), (5,)] # Layer duplicated in same chain, params just once pls. c = Chain(m, m) - @test size.(params(c)) == [(5, 10), (5, 5), (5,)] + @test size.(Flux.trainables(c)) == [(5, 10), (5, 5), (5,)] # Self-referential array. 
Just want params, no stack overflow pls. r = Any[nothing,m] r[1] = r - @test size.(params(r)) == [(5, 10), (5, 5), (5,)] + @test_broken size.(Flux.trainables(r)) == [(5, 10), (5, 5), (5,)] # Ensure functor explores inside Transpose but not SubArray m = (x = view([1,2,3]pi, 1:2), y = transpose([4 5]pi)) - @test size.(Flux.params(m)) == [(2,), (1, 2)] + @test size.(Flux.trainables(m)) == [(2,), (1, 2)] end -@testset "params gradient" begin +@testset "trainables gradient" begin m = (x=[1,2.0], y=[3.0]); - # Explicit -- was broken by #2054 - gnew = gradient(m -> (sum(norm, Flux.params(m))), m)[1] + gnew = gradient(m -> (sum(norm, Flux.trainables(m))), m)[1] @test gnew.x ≈ [0.4472135954999579, 0.8944271909999159] @test gnew.y ≈ [1.0] - - # Implicit - gold = gradient(() -> (sum(norm, Flux.params(m))), Flux.params(m)) - @test gold[m.x] ≈ [0.4472135954999579, 0.8944271909999159] - @test gold[m.y] ≈ [1.0] end @testset "Precision" begin @@ -345,28 +339,12 @@ end o = ones(s) z = zeros(s) - @testset "Explicit" begin - gfun(args...) = gradient((x, y) -> sum(op.(x,y)), args...) - g = gfun(o, z) - @test gfun(o, false) == (g[1], nothing) - - g = gfun(z, o) - @test gfun(false, o) == (nothing, g[2]) - end - - @testset "Implicit" begin - gfun(args...) = gradient(() -> sum(op.(args...)), params(collect(args))) - g = gfun(o, z) - - gres = gfun(o, false) - @test gres[o] == g[o] - @test false ∉ gres.params + gfun(args...) = gradient((x, y) -> sum(op.(x,y)), args...) + g = gfun(o, z) + @test gfun(o, false) == (g[1], nothing) - g = gfun(z, o) - gres = gfun(false, o) - @test gres[o] == g[o] - @test false ∉ gres.params - end + g = gfun(z, o) + @test gfun(false, o) == (nothing, g[2]) end end @@ -566,10 +544,10 @@ end @testset "Shared parameters" begin mat = [1 2; 3 4.0] simple = ((nothing, mat, (3, mat, 4))) - @test length(Flux.params(simple)) == 1 + @test length(Flux.trainables(simple)) == 1 oneadj = (nt = (m = mat, a = mat')) - @test length(Flux.params(oneadj)) == 1 # needs Functors@0.3 + @test length(Flux.trainables(oneadj)) == 1 # needs Functors@0.3 @test Flux.destructure(simple)[1] == Flux.destructure(oneadj)[1] == [1, 3, 2, 4] end @@ -631,13 +609,13 @@ end model = Model(d, d) # Works - g1 = Flux.gradient(() -> sum(model(x)), Flux.params(model)) + g1 = Flux.gradient(m -> sum(m(x)), model)[1] p, re = Flux.destructure(model) # Fails - g2 = Flux.gradient(p -> sum(re(p)(x)), p) + g2 = Flux.gradient(p -> sum(re(p)(x)), p)[1] - @test g2[1] ≈ vcat(g1[d.weight], g1[d.bias]) + @test g2 ≈ vcat(g1.a.weight + g1.b.weight, g1.a.bias + g1.b.bias) end @testset "issue 1826" begin
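The `Flux.params` → `Flux.trainables` substitution running through these tests amounts to the following; a small sketch with arbitrary layer sizes:

```julia
using Flux

m = Chain(Dense(10 => 5, relu), Dense(5 => 2))

# trainables returns the trainable arrays, deduplicating shared parameters,
# much like the old Flux.params but without the implicit Params machinery.
ps = Flux.trainables(m)
size.(ps)                                # [(5, 10), (5,), (2, 5), (2,)]

# It also composes with explicit gradients, e.g. for an L2 penalty:
penalty(m) = sum(p -> sum(abs2, p), Flux.trainables(m)) / 2
g = gradient(m -> sum(abs2, m(rand(Float32, 10, 3))) + penalty(m), m)[1]
```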