From 3cea17ea494faea019c6c2988f6796f7d4f37362 Mon Sep 17 00:00:00 2001 From: Saransh Chopra Date: Sun, 16 Oct 2022 14:28:06 +0530 Subject: [PATCH] Massive reformat --- docs/make.jl | 117 +-- perf/bench_utils.jl | 25 +- perf/conv.jl | 6 +- perf/dense.jl | 4 +- perf/recurrent.jl | 78 +- perf/vgg.jl | 82 +- src/Flux.jl | 17 +- src/cuda/cudnn.jl | 32 +- src/deprecations.jl | 71 +- src/functor.jl | 99 +-- src/layers/basic.jl | 417 +++++----- src/layers/conv.jl | 509 ++++++------ src/layers/normalise.jl | 472 ++++++------ src/layers/recurrent.jl | 320 ++++---- src/layers/show.jl | 173 +++-- src/layers/stateless.jl | 26 +- src/layers/upsample.jl | 74 +- src/loading.jl | 105 +-- src/losses/Losses.jl | 20 +- src/losses/functions.jl | 145 ++-- src/losses/utils.jl | 27 +- src/optimise/Optimise.jl | 8 +- src/optimise/optimisers.jl | 463 +++++------ src/optimise/train.jl | 105 +-- src/outputsize.jl | 224 +++--- src/utils.jl | 444 ++++++----- test/ctc-gpu.jl | 83 +- test/ctc.jl | 67 +- test/cuda/cuda.jl | 260 +++---- test/cuda/curnn.jl | 38 +- test/cuda/layers.jl | 436 +++++------ test/cuda/losses.jl | 69 +- test/cuda/runtests.jl | 8 +- test/cuda/test_utils.jl | 111 +-- test/data.jl | 56 +- test/layers/basic.jl | 680 ++++++++-------- test/layers/conv.jl | 462 +++++------ test/layers/normalisation.jl | 702 ++++++++--------- test/layers/recurrent.jl | 233 +++--- test/layers/show.jl | 104 ++- test/layers/stateless.jl | 16 +- test/layers/upsample.jl | 134 ++-- test/losses.jl | 271 +++---- test/optimise.jl | 344 ++++----- test/outputsize.jl | 427 +++++----- test/runtests.jl | 81 +- test/utils.jl | 1410 +++++++++++++++++----------------- 47 files changed, 5189 insertions(+), 4866 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 40d6033637..a6c6659cde 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,62 +1,67 @@ -using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore - +using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, + ChainRulesCore DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true) -makedocs( - modules = [Flux, NNlib, Functors, MLUtils, BSON, Optimisers, OneHotArrays, Zygote, ChainRulesCore, Base], - doctest = false, - sitename = "Flux", - # strict = [:cross_references,], - pages = [ - "Getting Started" => [ - "Welcome" => "index.md", - "Quick Start" => "models/quickstart.md", - "Fitting a Line" => "models/overview.md", - "Gradients and Layers" => "models/basics.md", - ], - "Building Models" => [ - "Built-in Layers 📚" => "models/layers.md", - "Recurrence" => "models/recurrence.md", - "Activation Functions 📚" => "models/activation.md", - "NNlib.jl 📚 (`softmax`, `conv`, ...)" => "models/nnlib.md", - ], - "Handling Data" => [ - "MLUtils.jl 📚 (`DataLoader`, ...)" => "data/mlutils.md", - "OneHotArrays.jl 📚 (`onehot`, ...)" => "data/onehot.md", - ], - "Training Models" => [ - "Training" => "training/training.md", - "Regularisation" => "models/regularisation.md", - "Loss Functions 📚" => "models/losses.md", - "Optimisation Rules 📚" => "training/optimisers.md", # TODO move optimiser intro up to Training - "Callback Helpers 📚" => "training/callbacks.md", - "Zygote.jl 📚 (`gradient`, ...)" => "training/zygote.md", - ], - "Model Tools" => [ - "GPU Support" => "gpu.md", - "Saving & Loading" => "saving.md", - "Shape Inference 📚" => "outputsize.md", - "Weight Initialisation 📚" => "utilities.md", - "Flat vs. Nested 📚" => "destructure.md", - "Functors.jl 📚 (`fmap`, ...)" => "models/functors.md", +makedocs(modules = [ + Flux, + NNlib, + Functors, + MLUtils, + BSON, + Optimisers, + OneHotArrays, + Zygote, + ChainRulesCore, + Base, ], - "Performance Tips" => "performance.md", - "Flux's Ecosystem" => "ecosystem.md", - "Tutorials" => [ # TODO, maybe - "Custom Layers" => "models/advanced.md", # TODO move freezing to Training + doctest = false, + sitename = "Flux", + # strict = [:cross_references,], + pages = [ + "Getting Started" => [ + "Welcome" => "index.md", + "Quick Start" => "models/quickstart.md", + "Fitting a Line" => "models/overview.md", + "Gradients and Layers" => "models/basics.md", + ], + "Building Models" => [ + "Built-in Layers 📚" => "models/layers.md", + "Recurrence" => "models/recurrence.md", + "Activation Functions 📚" => "models/activation.md", + "NNlib.jl 📚 (`softmax`, `conv`, ...)" => "models/nnlib.md", + ], + "Handling Data" => [ + "MLUtils.jl 📚 (`DataLoader`, ...)" => "data/mlutils.md", + "OneHotArrays.jl 📚 (`onehot`, ...)" => "data/onehot.md", + ], + "Training Models" => [ + "Training" => "training/training.md", + "Regularisation" => "models/regularisation.md", + "Loss Functions 📚" => "models/losses.md", + "Optimisation Rules 📚" => "training/optimisers.md", # TODO move optimiser intro up to Training + "Callback Helpers 📚" => "training/callbacks.md", + "Zygote.jl 📚 (`gradient`, ...)" => "training/zygote.md", + ], + "Model Tools" => [ + "GPU Support" => "gpu.md", + "Saving & Loading" => "saving.md", + "Shape Inference 📚" => "outputsize.md", + "Weight Initialisation 📚" => "utilities.md", + "Flat vs. Nested 📚" => "destructure.md", + "Functors.jl 📚 (`fmap`, ...)" => "models/functors.md", + ], + "Performance Tips" => "performance.md", + "Flux's Ecosystem" => "ecosystem.md", + "Tutorials" => [ # TODO, maybe + "Custom Layers" => "models/advanced.md", # TODO move freezing to Training + ], ], - ], - format = Documenter.HTML( - sidebar_sitename = false, - analytics = "UA-36890222-9", - assets = ["assets/flux.css"], - prettyurls = get(ENV, "CI", nothing) == "true" - ), -) + format = Documenter.HTML(sidebar_sitename = false, + analytics = "UA-36890222-9", + assets = ["assets/flux.css"], + prettyurls = get(ENV, "CI", nothing) == "true")) -deploydocs( - repo = "github.com/FluxML/Flux.jl.git", - target = "build", - push_preview = true -) +deploydocs(repo = "github.com/FluxML/Flux.jl.git", + target = "build", + push_preview = true) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index 525184f773..d7897851a4 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -3,36 +3,39 @@ using Flux using CUDA using Zygote: pullback, ignore - fw(m, x) = m(x) -bw(back) = back(1f0) +bw(back) = back(1.0f0) fwbw(m, ps, x) = gradient(() -> sum(fw(m, x)), ps) pb(m, ps, x) = pullback(() -> sum(fw(m, x)), ps) -function run_benchmark(model, x; cuda=true) - - if cuda +function run_benchmark(model, x; cuda = true) + if cuda model = model |> gpu x = x |> gpu end ps = Flux.params(model) - y, back = pb(model, ps, x) - + y, back = pb(model, ps, x) if cuda CUDA.allowscalar(false) # CUDA.device!(3) println(" forward") - fw(model, x); GC.gc(); CUDA.reclaim(); #warmup + fw(model, x) + GC.gc() + CUDA.reclaim() #warmup @btime CUDA.@sync(fw($model, $x)) teardown=(GC.gc(); CUDA.reclaim()) println(" backward") - bw(back); GC.gc(); CUDA.reclaim(); #warmup + bw(back) + GC.gc() + CUDA.reclaim() #warmup @btime CUDA.@sync(bw($back)) teardown=(GC.gc(); CUDA.reclaim()) - + println(" forw and back") - fwbw(model, ps, x); GC.gc(); CUDA.reclaim(); #warmup + fwbw(model, ps, x) + GC.gc() + CUDA.reclaim() #warmup @btime CUDA.@sync(fwbw($model, $ps, $x)) teardown=(GC.gc(); CUDA.reclaim()) else println(" forward") diff --git a/perf/conv.jl b/perf/conv.jl index 8da601e480..98dfcf46ce 100644 --- a/perf/conv.jl +++ b/perf/conv.jl @@ -1,8 +1,8 @@ for ch in [1, 3, 16, 64] x = rand(Float32, 64, 64, ch, 64) - model = Conv((3,3), ch=>ch) + model = Conv((3, 3), ch => ch) println("CPU ch=$ch") - run_benchmark(model, x, cuda=false) + run_benchmark(model, x, cuda = false) println("CUDA ch=$ch") - run_benchmark(model, x, cuda=true) + run_benchmark(model, x, cuda = true) end diff --git a/perf/dense.jl b/perf/dense.jl index 005d9360ba..1f77d21c55 100644 --- a/perf/dense.jl +++ b/perf/dense.jl @@ -2,7 +2,7 @@ for n in [2, 20, 200, 2000] x = randn(Float32, n, n) model = Dense(n, n) println("CPU n=$n") - run_benchmark(model, x, cuda=false) + run_benchmark(model, x, cuda = false) println("CUDA n=$n") - run_benchmark(model, x, cuda=true) + run_benchmark(model, x, cuda = true) end diff --git a/perf/recurrent.jl b/perf/recurrent.jl index ef00a8d9a5..ae8f68d0d6 100644 --- a/perf/recurrent.jl +++ b/perf/recurrent.jl @@ -1,62 +1,62 @@ - struct RNNWrapper{T} - rnn::T + rnn::T end Flux.@functor RNNWrapper # Need to specialize for RNNWrapper. fw(r::RNNWrapper, X::Vector{<:AbstractArray}) = begin - Flux.reset!(r.rnn) - [r.rnn(x) for x in X] + Flux.reset!(r.rnn) + [r.rnn(x) for x in X] end fw(r::RNNWrapper, X) = begin - Flux.reset!(r.rnn) - r.rnn(X) + Flux.reset!(r.rnn) + r.rnn(X) end -fwbw(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = gradient(ps) do - y = fw(r, X) - sum(sum(y)) -end +fwbw(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = + gradient(ps) do + y = fw(r, X) + return sum(sum(y)) + end -pb(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = pullback(ps) do - y = fw(r, X) - sum(sum(y)) -end +pb(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = + pullback(ps) do + y = fw(r, X) + return sum(sum(y)) + end function rnn_benchmark_sweep(data_creator::Function, rnn_type) - for n in [2, 20, 200, 1000], ts in [1, 4, 16, 64] - x, x_n = data_creator(n, ts) - model = RNNWrapper(rnn_type(n, n)) - - println("$rnn_type $x_n CPU n=$n, ts=$ts") - run_benchmark(model, x, cuda=false) - - println("$rnn_type $x_n CUDA n=$n, ts=$ts") - try - run_benchmark(model, x, cuda=true) - catch ex - @show typeof(ex) - if ex isa OutOfGPUMemoryError - @warn "Not enough GPU memory to run test" - else - rethrow(ex) - end + for n in [2, 20, 200, 1000], ts in [1, 4, 16, 64] + x, x_n = data_creator(n, ts) + model = RNNWrapper(rnn_type(n, n)) + + println("$rnn_type $x_n CPU n=$n, ts=$ts") + run_benchmark(model, x, cuda = false) + + println("$rnn_type $x_n CUDA n=$n, ts=$ts") + try + run_benchmark(model, x, cuda = true) + catch ex + @show typeof(ex) + if ex isa OutOfGPUMemoryError + @warn "Not enough GPU memory to run test" + else + rethrow(ex) + end + end end - end end for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] - rnn_benchmark_sweep(rnn_type) do n, ts - [randn(Float32, n, n) for _ in 1:ts], "Vec" - end + rnn_benchmark_sweep(rnn_type) do n, ts + return [randn(Float32, n, n) for _ in 1:ts], "Vec" + end end for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] - rnn_benchmark_sweep(rnn_type) do n, ts - randn(Float32, n, n, ts), "Block" - end + rnn_benchmark_sweep(rnn_type) do n, ts + return randn(Float32, n, n, ts), "Block" + end end - diff --git a/perf/vgg.jl b/perf/vgg.jl index 708c152c90..13ac8e8c77 100644 --- a/perf/vgg.jl +++ b/perf/vgg.jl @@ -6,50 +6,48 @@ using CUDA using Zygote: pullback function vgg16() - Chain( - Conv((3, 3), 3 => 64, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(64), - Conv((3, 3), 64 => 64, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(64), - MaxPool((2,2)), - Conv((3, 3), 64 => 128, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(128), - Conv((3, 3), 128 => 128, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(128), - MaxPool((2,2)), - Conv((3, 3), 128 => 256, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(256), - Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(256), - MaxPool((2,2)), - Conv((3, 3), 256 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - MaxPool((2,2)), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)), - BatchNorm(512), - MaxPool((2,2)), - flatten, - Dense(512, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dropout(0.5), - Dense(4096, 10) - ) + return Chain(Conv((3, 3), 3 => 64, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + Conv((3, 3), 64 => 64, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(64), + MaxPool((2, 2)), + Conv((3, 3), 64 => 128, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + Conv((3, 3), 128 => 128, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(128), + MaxPool((2, 2)), + Conv((3, 3), 128 => 256, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + Conv((3, 3), 256 => 256, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(256), + MaxPool((2, 2)), + Conv((3, 3), 256 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + Conv((3, 3), 512 => 512, relu, pad = (1, 1), stride = (1, 1)), + BatchNorm(512), + MaxPool((2, 2)), + flatten, + Dense(512, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dropout(0.5), + Dense(4096, 10)) end -let model=vgg16(), x=rand(Float32, 32, 32, 3, 64) +let model = vgg16(), x = rand(Float32, 32, 32, 3, 64) println("CPU benchmark") - run_benchmark(model, x, cuda=false) + run_benchmark(model, x, cuda = false) println("CUDA benchmark") - run_benchmark(model, x, cuda=true) + run_benchmark(model, x, cuda = true) end diff --git a/src/Flux.jl b/src/Flux.jl index fcb473ba2c..251c472fd2 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -14,7 +14,9 @@ using Zygote: Params, @adjoint, gradient, pullback, @nograd export gradient # Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.) -Optimisers.base(dx::Zygote.Grads) = error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") +function Optimisers.base(dx::Zygote.Grads) + return error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") +end export Chain, Dense, Embedding, Maxout, SkipConnection, Parallel, PairwiseFusion, RNN, LSTM, GRU, GRUv3, @@ -30,19 +32,21 @@ using .Optimise using .Optimise: @epochs using .Optimise: skip export Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, - AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, - WeightDecay, ClipValue, ClipNorm + AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, OAdam, + AdamW, RAdam, AdaBelief, InvDecay, ExpDecay, + WeightDecay, ClipValue, ClipNorm using CUDA -const use_cuda = Ref{Union{Nothing,Bool}}(nothing) +const use_cuda = Ref{Union{Nothing, Bool}}(nothing) using Adapt, Functors, OneHotArrays include("utils.jl") include("functor.jl") # Pirate error to catch a common mistake. -Functors.functor(::Type{<:MLUtils.DataLoader}, x) = error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.") +function Functors.functor(::Type{<:MLUtils.DataLoader}, x) + return error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.") +end include("layers/stateless.jl") include("layers/basic.jl") @@ -60,7 +64,6 @@ export @autosize include("data/Data.jl") using .Data - include("losses/Losses.jl") using .Losses # TODO: stop importing Losses in Flux's namespace in v0.12 diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl index 9e6bdb53a0..c20a7f873c 100644 --- a/src/cuda/cudnn.jl +++ b/src/cuda/cudnn.jl @@ -1,21 +1,21 @@ import NNlibCUDA: batchnorm, ∇batchnorm -function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}}, - cache=nothing) where T<:Union{Float32, Float64} - - @assert BN.affine "BatchNorm: only affine=true supported on gpu" - @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" - @assert length(BN.β) == size(x, ndims(x)-1) "BatchNorm: input has wrong number of channels" - return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; - cache=cache, alpha=1, beta=0, eps=BN.ϵ, - training=Flux._isactive(BN))) +function (BN::Flux.BatchNorm)(x::Union{CuArray{T, 2}, CuArray{T, 4}, CuArray{T, 5}}, + cache = nothing) where {T <: Union{Float32, Float64}} + @assert BN.affine "BatchNorm: only affine=true supported on gpu" + @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu" + @assert length(BN.β)==size(x, ndims(x) - 1) "BatchNorm: input has wrong number of channels" + return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; + cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, + training = Flux._isactive(BN))) end -function ChainRulesCore.rrule(::typeof(batchnorm), g, b, x, running_mean, running_var, momentum; kw...) - y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) - function batchnorm_pullback(Δ) - grad = ∇batchnorm(g, b, x, unthunk(Δ), running_mean, running_var, momentum; kw...) - (NoTangent(), grad..., NoTangent(), NoTangent(), NoTangent()) - end - y, batchnorm_pullback +function ChainRulesCore.rrule(::typeof(batchnorm), g, b, x, running_mean, running_var, + momentum; kw...) + y = batchnorm(g, b, x, running_mean, running_var, momentum; kw...) + function batchnorm_pullback(Δ) + grad = ∇batchnorm(g, b, x, unthunk(Δ), running_mean, running_var, momentum; kw...) + return (NoTangent(), grad..., NoTangent(), NoTangent(), NoTangent()) + end + return y, batchnorm_pullback end diff --git a/src/deprecations.jl b/src/deprecations.jl index 8c3bc963a4..acb87b9625 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -1,69 +1,80 @@ # v0.12 deprecations function ones(dims...) - Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", :ones, force=true) - Base.ones(Float32, dims...) + Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", + :ones, force = true) + return Base.ones(Float32, dims...) end ones(T::Type, dims...) = Base.ones(T, dims...) function zeros(dims...) - Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", :zeros, force=true) - Base.zeros(Float32, dims...) + Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", + :zeros, force = true) + return Base.zeros(Float32, dims...) end zeros(T::Type, dims...) = Base.zeros(T, dims...) -ones32(::Type, dims...) = throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type")) -zeros32(::Type, dims...) = throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) +function ones32(::Type, dims...) + throw(ArgumentError("Flux.ones32 is always Float32, use Base.ones to specify the element type")) +end +function zeros32(::Type, dims...) + throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) +end # v0.13 deprecations function Broadcast.broadcasted(f::Recur, args...) - # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12 - Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. - Re-writing this as a comprehension would be better.""", :broadcasted) - map(f, args...) # map isn't really safe either, but + # This had an explicit @adjoint rule, calling Zygote.∇map(__context__, f, args...), until v0.12 + Base.depwarn("""Broadcasting is not safe to use with RNNs, as it does not guarantee an iteration order. + Re-writing this as a comprehension would be better.""", :broadcasted) + return map(f, args...) # map isn't really safe either, but end @deprecate frequencies(xs) group_counts(xs) struct Zeros - function Zeros() - Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", :Zeros) - false - end + function Zeros() + Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", + :Zeros) + return false + end end Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros()) function Optimise.update!(x::AbstractArray, x̄) - Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", :update!) - x .-= x̄ + Base.depwarn("`Flux.Optimise.update!(x, x̄)` was not used internally and has been removed. Please write `x .-= x̄` instead.", + :update!) + return x .-= x̄ end function Diagonal(size::Integer...; kw...) - Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal) - Scale(size...; kw...) + Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal) + return Scale(size...; kw...) end function Diagonal(size::Tuple; kw...) - Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal) - Scale(size...; kw...) + Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", + :Diagonal) + return Scale(size...; kw...) end # Deprecate this eventually once saving models w/o structure is no more function loadparams!(m, xs) - Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", :loadparams!) - for (p, x) in zip(params(m), xs) - size(p) == size(x) || - error("Expected param size $(size(p)), got $(size(x))") - copyto!(p, x) - end + Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", + :loadparams!) + for (p, x) in zip(params(m), xs) + size(p) == size(x) || + error("Expected param size $(size(p)), got $(size(x))") + copyto!(p, x) + end end # Channel notation: Changed to match Conv, but very softly deprecated! # Perhaps change to @deprecate for v0.14, but there is no plan to remove these. -Dense(in::Integer, out::Integer, σ = identity; kw...) = - Dense(in => out, σ; kw...) -Bilinear(in1::Integer, in2::Integer, out::Integer, σ = identity; kw...) = - Bilinear((in1, in2) => out, σ; kw...) +Dense(in::Integer, out::Integer, σ = identity; kw...) = Dense(in => out, σ; kw...) +function Bilinear(in1::Integer, in2::Integer, out::Integer, σ = identity; kw...) + return Bilinear((in1, in2) => out, σ; kw...) +end Embedding(in::Integer, out::Integer; kw...) = Embedding(in => out; kw...) RNNCell(in::Integer, out::Integer, σ = tanh; kw...) = RNNCell(in => out, σ; kw...) diff --git a/src/functor.jl b/src/functor.jl index 13adbe13ff..993ea95693 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -1,5 +1,5 @@ import Adapt: adapt, adapt_storage -using LinearAlgebra: Cholesky +using LinearAlgebra: Cholesky using Zygote: IdSet import Functors: Functors, @functor, functor, fmap, isleaf using SparseArrays: AbstractSparseArray @@ -14,9 +14,10 @@ _Note_: if you manually set a model into test mode, you need to manually place it back into train mode during training phase. Possible values include: -- `false` for training -- `true` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically + + - `false` for training + - `true` for testing + - `:auto` or `nothing` for Flux to detect the mode automatically """ testmode!(m, mode = true) = (foreach(x -> testmode!(x, mode), trainable(m)); m) @@ -30,23 +31,24 @@ _Note_: if you manually set a model into train mode, you need to manually place it into test mode during testing phase. Possible values include: -- `true` for training -- `false` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically + + - `true` for training + - `false` for testing + - `:auto` or `nothing` for Flux to detect the mode automatically """ trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode) function params!(p::Params, x, seen = IdSet()) - if x isa AbstractArray{<:Number} && Functors.isleaf(x) - return push!(p, x) - elseif x in seen - nothing - else - push!(seen, x) - for child in trainable(x) - params!(p, child, seen) + if x isa AbstractArray{<:Number} && Functors.isleaf(x) + return push!(p, x) + elseif x in seen + nothing + else + push!(seen, x) + for child in trainable(x) + params!(p, child, seen) + end end - end end """ @@ -60,10 +62,11 @@ This can be used with the `gradient` function, see [Taking Gradients](@ref), or The behaviour of `params` on custom types can be customized using [`Functors.@functor`](@ref) or [`Flux.trainable`](@ref). # Examples + ```jldoctest julia> using Flux: params -julia> params(Chain(Dense(ones(2,3)), softmax)) # unpacks Flux models +julia> params(Chain(Dense(ones(2, 3)), softmax)) # unpacks Flux models Params([[1.0 1.0 1.0; 1.0 1.0 1.0], [0.0, 0.0]]) julia> bn = BatchNorm(2, relu) @@ -78,14 +81,14 @@ Params([[1, 2, 3], [4]]) julia> params([[1, 2, 3], [4]]) # unpacks array of arrays Params([[1, 2, 3], [4]]) -julia> params(1, [2 2], (alpha=[3,3,3], beta=Ref(4), gamma=sin)) # ignores scalars, unpacks NamedTuples +julia> params(1, [2 2], (alpha = [3, 3, 3], beta = Ref(4), gamma = sin)) # ignores scalars, unpacks NamedTuples Params([[2 2], [3, 3, 3]]) ``` """ function params(m...) - ps = Params() - params!(ps, m) - return ps + ps = Params() + params!(ps, m) + return ps end # Allows caching of the parameters when params is called within gradient() to fix #2040. @@ -95,13 +98,14 @@ struct FluxCUDAAdaptor end adapt_storage(to::FluxCUDAAdaptor, x) = CUDA.cu(x) adapt_storage(to::FluxCUDAAdaptor, x::Zygote.FillArrays.AbstractFill) = CUDA.cu(collect(x)) if VERSION >= v"1.7" - adapt_storage(to::FluxCUDAAdaptor, x::Random.TaskLocalRNG) = CUDA.default_rng() + adapt_storage(to::FluxCUDAAdaptor, x::Random.TaskLocalRNG) = CUDA.default_rng() else - adapt_storage(to::FluxCUDAAdaptor, x::Random._GLOBAL_RNG) = CUDA.default_rng() + adapt_storage(to::FluxCUDAAdaptor, x::Random._GLOBAL_RNG) = CUDA.default_rng() end adapt_storage(to::FluxCUDAAdaptor, x::CUDA.RNG) = x -adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) = - error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().") +function adapt_storage(to::FluxCUDAAdaptor, x::AbstractRNG) + return error("Cannot map RNG of type $(typeof(x)) to GPU. GPU execution only supports Random.default_rng().") +end # TODO: figure out the correct design for OneElement adapt_storage(to::FluxCUDAAdaptor, x::Zygote.OneElement) = CUDA.cu(collect(x)) @@ -112,18 +116,23 @@ struct FluxCPUAdaptor end adapt_storage(to::FluxCPUAdaptor, x::AbstractArray) = adapt(Array, x) adapt_storage(to::FluxCPUAdaptor, x::AbstractRange) = x adapt_storage(to::FluxCPUAdaptor, x::Zygote.FillArrays.AbstractFill) = x -adapt_storage(to::FluxCPUAdaptor, x::T) where T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix = adapt(Array, x) +function adapt_storage(to::FluxCPUAdaptor, + x::T) where {T <: CUDA.CUSPARSE.CUDA.CUSPARSE.AbstractCuSparseMatrix} + return adapt(Array, x) +end adapt_storage(to::FluxCPUAdaptor, x::Zygote.OneElement) = x adapt_storage(to::FluxCPUAdaptor, x::AbstractSparseArray) = x adapt_storage(to::FluxCPUAdaptor, x::CUDA.RNG) = Random.default_rng() adapt_storage(to::FluxCPUAdaptor, x::AbstractRNG) = x function ChainRulesCore.rrule(::Type{Array}, x::CUDA.CuArray) - Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx)),) + return Array(x), dx -> (NoTangent(), CUDA.cu(unthunk(dx))) end -function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, x::CUDA.AbstractGPUArray) - adapt_storage(to, x), dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx)),) +function ChainRulesCore.rrule(::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, + x::CUDA.AbstractGPUArray) + return adapt_storage(to, x), + dx -> (NoTangent(), NoTangent(), adapt_storage(FluxCUDAAdaptor(), unthunk(dx))) end # CPU/GPU movement conveniences @@ -135,7 +144,7 @@ Moves `m` onto the CPU, the opposite of [`gpu`](@ref). Recurses into structs marked [`@functor`](@ref). ```julia-repl -julia> m = Dense(1,2) +julia> m = Dense(1, 2) Dense(1, 2) julia> m_gpu = gpu(m) @@ -154,7 +163,7 @@ Matrix{Float32} cpu(x) = fmap(x -> adapt(FluxCPUAdaptor(), x), x) _isbitsarray(::AbstractArray{<:Number}) = true -_isbitsarray(::AbstractArray{T}) where T = isbitstype(T) +_isbitsarray(::AbstractArray{T}) where {T} = isbitstype(T) _isbitsarray(x) = false _isleaf(::AbstractRNG) = true @@ -164,13 +173,13 @@ _isleaf(x) = _isbitsarray(x) || Functors.isleaf(x) gpu(x) Moves `m` to the current GPU device, if available. It is a no-op otherwise. -See the [CUDA.jl docs](https://juliagpu.github.io/CUDA.jl/stable/usage/multigpu/) +See the [CUDA.jl docs](https://juliagpu.github.io/CUDA.jl/stable/usage/multigpu/) to help identify the current device. This works for functions, and any struct marked with [`@functor`](@ref). ```julia-repl -julia> m = Dense(1,2) +julia> m = Dense(1, 2) Dense(1, 2) julia> typeof(m.W) @@ -184,21 +193,22 @@ CuArray{Float32, 2} ``` """ function gpu(x) - check_use_cuda() - use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isleaf) : x + check_use_cuda() + return use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isleaf) : + x end function check_use_cuda() - if use_cuda[] === nothing - use_cuda[] = CUDA.functional() - if use_cuda[] && !CUDA.has_cudnn() - @warn "CUDA.jl found cuda, but did not find libcudnn. Some functionality will not be available." - end - if !(use_cuda[]) - @info """The GPU function is being called but the GPU is not accessible. - Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog=1 + if use_cuda[] === nothing + use_cuda[] = CUDA.functional() + if use_cuda[] && !CUDA.has_cudnn() + @warn "CUDA.jl found cuda, but did not find libcudnn. Some functionality will not be available." + end + if !(use_cuda[]) + @info """The GPU function is being called but the GPU is not accessible. + Defaulting back to the CPU. (No action is required if you want to run on the CPU).""" maxlog=1 + end end - end end ChainRulesCore.@non_differentiable check_use_cuda() @@ -227,4 +237,3 @@ f64(m) = paramtype(Float64, m) # Functors for certain Julia data structures @functor Cholesky trainable(c::Cholesky) = () - diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 42813cb5f7..a6f27a8b06 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -9,7 +9,7 @@ and if names are given, `m[:name] == m[1]` etc. # Examples ```jldoctest -julia> m = Chain(x -> x^2, x -> x+1); +julia> m = Chain(x -> x^2, x -> x + 1); julia> m(5) == 26 true @@ -21,7 +21,7 @@ julia> x = rand(10, 32); julia> m(x) == m[2](m[1](x)) true -julia> m2 = Chain(enc = Chain(Flux.flatten, Dense(10 => 5, tanh)), +julia> m2 = Chain(enc = Chain(Flux.flatten, Dense(10 => 5, tanh)), dec = Dense(5 => 2)); julia> m2(x) == (m2[:dec] ∘ m2[:enc])(x) @@ -32,51 +32,57 @@ For large models, there is a special type-unstable path which can reduce compila times. This can be used by supplying a vector of layers `Chain([layer1, layer2, ...])`. This feature is somewhat experimental, beware! """ -struct Chain{T<:Union{Tuple, NamedTuple, AbstractVector}} - layers::T +struct Chain{T <: Union{Tuple, NamedTuple, AbstractVector}} + layers::T end Chain(xs...) = Chain(xs) function Chain(; kw...) - :layers in keys(kw) && throw(ArgumentError("a Chain cannot have a named layer called `layers`")) - isempty(kw) && return Chain(()) - Chain(values(kw)) + :layers in keys(kw) && + throw(ArgumentError("a Chain cannot have a named layer called `layers`")) + isempty(kw) && return Chain(()) + return Chain(values(kw)) end @forward Chain.layers Base.getindex, Base.length, Base.first, Base.last, - Base.iterate, Base.lastindex, Base.keys, Base.firstindex + Base.iterate, Base.lastindex, Base.keys, Base.firstindex @functor Chain (c::Chain)(x) = _applychain(c.layers, x) -@generated function _applychain(layers::Tuple{Vararg{<:Any,N}}, x) where {N} - symbols = vcat(:x, [gensym() for _ in 1:N]) - calls = [:($(symbols[i+1]) = layers[$i]($(symbols[i]))) for i in 1:N] - Expr(:block, calls...) +@generated function _applychain(layers::Tuple{Vararg{<:Any, N}}, x) where {N} + symbols = vcat(:x, [gensym() for _ in 1:N]) + calls = [:($(symbols[i + 1]) = layers[$i]($(symbols[i]))) for i in 1:N] + return Expr(:block, calls...) end _applychain(layers::NamedTuple, x) = _applychain(Tuple(layers), x) function _applychain(layers::AbstractVector, x) # type-unstable path, helps compile times - for f in layers - x = f(x) - end - x + for f in layers + x = f(x) + end + return x end Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]) -Base.getindex(c::Chain{<:NamedTuple}, i::AbstractArray) = - Chain(NamedTuple{keys(c)[i]}(Tuple(c.layers)[i])) +function Base.getindex(c::Chain{<:NamedTuple}, i::AbstractArray) + return Chain(NamedTuple{keys(c)[i]}(Tuple(c.layers)[i])) +end function Base.show(io::IO, c::Chain) - print(io, "Chain(") - _show_layers(io, c.layers) - print(io, ")") + print(io, "Chain(") + _show_layers(io, c.layers) + return print(io, ")") end _show_layers(io, layers::Tuple) = join(io, layers, ", ") -_show_layers(io, layers::NamedTuple) = join(io, ["$k = $v" for (k, v) in pairs(layers)], ", ") -_show_layers(io, layers::AbstractVector) = (print(io, "["); join(io, layers, ", "); print(io, "]")) +function _show_layers(io, layers::NamedTuple) + return join(io, ["$k = $v" for (k, v) in pairs(layers)], ", ") +end +function _show_layers(io, layers::AbstractVector) + return (print(io, "["); join(io, layers, ", "); print(io, "]")) +end # This is a temporary and naive implementation # it might be replaced in the future for better performance @@ -93,7 +99,7 @@ Like calling a `Chain`, but saves the result of each layer as an output. ```jldoctest julia> using Flux: activations -julia> c = Chain(x -> x + 1, x -> x * 2, x -> x ^ 3); +julia> c = Chain(x -> x + 1, x -> x * 2, x -> x^3); julia> activations(c, 1) (2, 4, 64) @@ -103,12 +109,11 @@ activations(c::Chain, input) = _extraChain(Tuple(c.layers), input) # Calculates the forward results of each layer provided in a `Tuple` with `x` as model input. function _extraChain(fs::Tuple, x) - res = first(fs)(x) - return (res, _extraChain(Base.tail(fs), res)...) + res = first(fs)(x) + return (res, _extraChain(Base.tail(fs), res)...) end _extraChain(::Tuple{}, x) = () - """ Dense(in => out, σ=identity; bias=true, init=glorot_uniform) Dense(W::AbstractMatrix, [bias, σ]) @@ -128,6 +133,7 @@ given to keyword `init`, with default [`glorot_uniform`](@ref Flux.glorot_unifor The weight matrix and/or the bias vector (of length `out`) may also be provided explicitly. # Examples + ```jldoctest julia> d = Dense(5 => 2) Dense(5 => 2) # 12 parameters @@ -150,36 +156,35 @@ julia> Flux.params(d1) # no trainable bias Params([[1.0 1.0 … 1.0 1.0; 1.0 1.0 … 1.0 1.0]]) ``` """ -struct Dense{F, M<:AbstractMatrix, B} - weight::M - bias::B - σ::F - function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix, F} - b = create_bias(W, bias, size(W,1)) - new{F,M,typeof(b)}(W, b, σ) - end +struct Dense{F, M <: AbstractMatrix, B} + weight::M + bias::B + σ::F + function Dense(W::M, bias = true, σ::F = identity) where {M <: AbstractMatrix, F} + b = create_bias(W, bias, size(W, 1)) + return new{F, M, typeof(b)}(W, b, σ) + end end function Dense((in, out)::Pair{<:Integer, <:Integer}, σ = identity; init = glorot_uniform, bias = true) - Dense(init(out, in), bias, σ) + return Dense(init(out, in), bias, σ) end @functor Dense function (a::Dense)(x::AbstractVecOrMat) - σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc - return σ.(a.weight * x .+ a.bias) + σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc + return σ.(a.weight * x .+ a.bias) end -(a::Dense)(x::AbstractArray) = - reshape(a(reshape(x, size(x,1), :)), :, size(x)[2:end]...) +(a::Dense)(x::AbstractArray) = reshape(a(reshape(x, size(x, 1), :)), :, size(x)[2:end]...) function Base.show(io::IO, l::Dense) - print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) - l.σ == identity || print(io, ", ", l.σ) - l.bias == false && print(io, "; bias=false") - print(io, ")") + print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) + l.σ == identity || print(io, ", ", l.σ) + l.bias == false && print(io, "; bias=false") + return print(io, ")") end """ @@ -191,14 +196,15 @@ Create an element-wise layer, whose forward pass is given by: y = σ.(scale .* x .+ bias) This uses `.*` instead of matrix multiplication `*` of [`Dense`](@ref). - + The learnable scale & bias are initialised `init(size...)` and `zeros32(size...)`, -with `init=ones32` by default. You may specify the function `init`, +with `init=ones32` by default. You may specify the function `init`, turn off trainable bias with `bias=false`, or provide the array(s) explicitly. Used by [`LayerNorm`](@ref) with `affine=true`. # Examples + ```jldoctest julia> a = Flux.Scale(2) Scale(2) # 4 parameters @@ -223,31 +229,37 @@ julia> Flux.params(b) Params([[1 2 3 4]]) ``` """ -struct Scale{F, A<:AbstractArray, B} - scale::A - bias::B - σ::F - function Scale(scale::A, bias::B = true, σ::F = identity) where {A<:AbstractArray, B<:Union{Bool, AbstractArray}, F} - b = create_bias(scale, bias, size(scale)...) - new{F, A, typeof(b)}(scale, b, σ) - end +struct Scale{F, A <: AbstractArray, B} + scale::A + bias::B + σ::F + function Scale(scale::A, bias::B = true, + σ::F = identity) where {A <: AbstractArray, + B <: Union{Bool, AbstractArray}, F} + b = create_bias(scale, bias, size(scale)...) + return new{F, A, typeof(b)}(scale, b, σ) + end end -Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act = identity) = Scale(init(s1, s23...), bias, _act) -Scale(size_act...; bias = true, init = ones32) = Scale(size_act[1:end-1]...; bias, init, _act = size_act[end]) +function Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act = identity) + return Scale(init(s1, s23...), bias, _act) +end +function Scale(size_act...; bias = true, init = ones32) + return Scale(size_act[1:(end - 1)]...; bias, init, _act = size_act[end]) +end @functor Scale function (a::Scale)(x::AbstractArray) - σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc - σ.(a.scale .* x .+ a.bias) + σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc + return σ.(a.scale .* x .+ a.bias) end function Base.show(io::IO, l::Scale) - print(io, "Scale(", join(size(l.scale), ", ")) - l.σ == identity || print(io, ", ", l.σ) - l.bias == false && print(io, "; bias=false") - print(io, ")") + print(io, "Scale(", join(size(l.scale), ", ")) + l.σ == identity || print(io, ", ", l.σ) + l.bias == false && print(io, "; bias=false") + return print(io, ")") end """ @@ -261,12 +273,13 @@ Instead of defining layers individually, you can provide a zero-argument functio which constructs them, and the number to construct. Maxout over linear dense layers satisfies the univeral approximation theorem. -See Goodfellow, Warde-Farley, Mirza, Courville & Bengio "Maxout Networks" +See Goodfellow, Warde-Farley, Mirza, Courville & Bengio "Maxout Networks" [https://arxiv.org/abs/1302.4389](https://arxiv.org/abs/1302.4389). See also [`Parallel`](@ref) to reduce with other operators. # Examples + ```jldoctest julia> m = Maxout(x -> abs2.(x), x -> x .* 3); @@ -285,8 +298,8 @@ julia> Flux.outputsize(m3, (5, 11)) (7, 11) ``` """ -struct Maxout{T<:Tuple} - layers::T +struct Maxout{T <: Tuple} + layers::T end Maxout(layers...) = Maxout(layers) Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ in 1:n_alts)...) @@ -294,18 +307,17 @@ Maxout(f::Function, n_alts::Integer) = Maxout((f() for _ in 1:n_alts)...) @functor Maxout function (mo::Maxout)(input::AbstractArray) - # Perhaps surprisingly, pairwise max broadcast is often faster, - # even with Zygote. See #698 and #1794 - mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.layers) + # Perhaps surprisingly, pairwise max broadcast is often faster, + # even with Zygote. See #698 and #1794 + return mapreduce(f -> f(input), (acc, out) -> max.(acc, out), mo.layers) end function Base.show(io::IO, mo::Maxout) - print(io, "Maxout(") - _show_layers(io, mo.layers) - print(io, ")") + print(io, "Maxout(") + _show_layers(io, mo.layers) + return print(io, ")") end - """ SkipConnection(layer, connection) @@ -317,15 +329,16 @@ will be propagated through the given `layer` while the second is the unchanged, The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`. Here is a more complicated example: + ```jldoctest -julia> m = Conv((3,3), 4 => 7, pad=(1,1)); +julia> m = Conv((3, 3), 4 => 7, pad = (1, 1)); julia> x = ones(Float32, 5, 5, 4, 10); julia> size(m(x)) == (5, 5, 7, 10) true -julia> sm = SkipConnection(m, (mx, x) -> cat(mx, x, dims=3)); +julia> sm = SkipConnection(m, (mx, x) -> cat(mx, x, dims = 3)); julia> size(sm(x)) == (5, 5, 11, 10) true @@ -333,19 +346,19 @@ true See also [`Parallel`](@ref), [`Maxout`](@ref). """ -struct SkipConnection{T,F} - layers::T - connection::F #user can pass arbitrary connections here, such as (a,b) -> a + b +struct SkipConnection{T, F} + layers::T + connection::F #user can pass arbitrary connections here, such as (a,b) -> a + b end @functor SkipConnection function (skip::SkipConnection)(input) - skip.connection(skip.layers(input), input) + return skip.connection(skip.layers(input), input) end function Base.show(io::IO, b::SkipConnection) - print(io, "SkipConnection(", b.layers, ", ", b.connection, ")") + return print(io, "SkipConnection(", b.layers, ", ", b.connection, ")") end """ @@ -373,6 +386,7 @@ By default the bias vector is `zeros(Float32, out)`, option `bias=false` will sw trainable bias. Either of these may be provided explicitly. # Examples + ```jldoctest julia> x, y = randn(Float32, 5, 32), randn(Float32, 5, 32); @@ -382,71 +396,76 @@ Bilinear(5 => 7) # 182 parameters julia> B(x) |> size # interactions based on one input (7, 32) -julia> B(x,y) == B((x,y)) # two inputs, may be given as a tuple +julia> B(x, y) == B((x, y)) # two inputs, may be given as a tuple true -julia> sc = SkipConnection( - Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), - Flux.Bilinear((9, 5) => 3, bias=false), - ); # used as the recombinator, with skip as the second input +julia> sc = SkipConnection(Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), + Flux.Bilinear((9, 5) => 3, bias = false)); # used as the recombinator, with skip as the second input julia> sc(x) |> size (3, 32) -julia> Flux.Bilinear(rand(4,8,16), false, tanh) # first dim of weight is the output +julia> Flux.Bilinear(rand(4, 8, 16), false, tanh) # first dim of weight is the output Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters ``` """ -struct Bilinear{F,A,B} - weight::A - bias::B - σ::F - function Bilinear(W::A, bias = true, σ::F = identity) where {A<:AbstractArray, F} - ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights")) - b = create_bias(W, bias, size(W,1)) - new{F,A,typeof(b)}(W, b, σ) - end +struct Bilinear{F, A, B} + weight::A + bias::B + σ::F + function Bilinear(W::A, bias = true, σ::F = identity) where {A <: AbstractArray, F} + ndims(A) == 3 || throw(ArgumentError("expected a 3-array of weights")) + b = create_bias(W, bias, size(W, 1)) + return new{F, A, typeof(b)}(W, b, σ) + end end @functor Bilinear function Bilinear(((in1, in2), out)::Pair{<:Tuple, <:Integer}, σ = identity; bias = true, init = glorot_uniform) - Bilinear(init(out, in1, in2), bias, σ) + return Bilinear(init(out, in1, in2), bias, σ) +end +function Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) + return Bilinear((in12, in12) => out, σ; kw...) end -Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) = Bilinear((in12, in12) => out, σ; kw...) function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix) - W, b, σ = a.weight, a.bias, a.σ + W, b, σ = a.weight, a.bias, a.σ - d_z, d_x, d_y = size(W) - d_x == size(x,1) && d_y == size(y,1) || throw(DimensionMismatch("number of rows in data must match W")) - size(x,2) == size(y,2) || throw(DimensionMismatch("Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))")) + d_z, d_x, d_y = size(W) + d_x == size(x, 1) && d_y == size(y, 1) || + throw(DimensionMismatch("number of rows in data must match W")) + size(x, 2) == size(y, 2) || + throw(DimensionMismatch("Data inputs must agree on number of columns, got $(size(x,2)) and $(size(y,2))")) - # @einsum Wy[o,i,s] := W[o,i,j] * y[j,s] - Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :)) + # @einsum Wy[o,i,s] := W[o,i,j] * y[j,s] + Wy = reshape(reshape(W, (:, d_y)) * y, (d_z, d_x, :)) - # @einsum Z[o,s] := Wy[o,i,s] * x[i,s] - Wyx = batched_mul(Wy, reshape(x, (d_x, 1, :))) - Z = reshape(Wyx, (d_z, :)) + # @einsum Z[o,s] := Wy[o,i,s] * x[i,s] + Wyx = batched_mul(Wy, reshape(x, (d_x, 1, :))) + Z = reshape(Wyx, (d_z, :)) - # @einsum out[o,s] := σ(Z[o,i] + b[o]) - σ.(Z .+ b) + # @einsum out[o,s] := σ(Z[o,i] + b[o]) + return σ.(Z .+ b) end (a::Bilinear)(x::AbstractVecOrMat) = a(x, x) -(a::Bilinear)(x::AbstractVector, y::AbstractVector) = vec(a(reshape(x, :,1), reshape(y, :,1))) +function (a::Bilinear)(x::AbstractVector, y::AbstractVector) + return vec(a(reshape(x, :, 1), reshape(y, :, 1))) +end (a::Bilinear)(x::NTuple{2, AbstractArray}) = a(x[1], x[2]) function Base.show(io::IO, l::Bilinear) - if size(l.weight, 2) == size(l.weight, 3) - print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1)) - else - print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) - end - l.σ == identity || print(io, ", ", l.σ) - l.bias === false && print(io, "; bias=false") - print(io, ")") + if size(l.weight, 2) == size(l.weight, 3) + print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1)) + else + print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", + size(l.weight, 1)) + end + l.σ == identity || print(io, ", ", l.σ) + l.bias === false && print(io, "; bias=false") + return print(io, ")") end """ @@ -492,19 +511,19 @@ julia> model2[:β] == model2[2] true ``` """ -struct Parallel{F, T<:Union{Tuple, NamedTuple}} - connection::F - layers::T +struct Parallel{F, T <: Union{Tuple, NamedTuple}} + connection::F + layers::T end Parallel(connection, layers...) = Parallel(connection, layers) function Parallel(connection; kw...) - layers = NamedTuple(kw) - if :layers in keys(layers) || :connection in keys(layers) - throw(ArgumentError("a Parallel layer cannot have a named sub-layer called `connection` or `layers`")) - end - isempty(layers) && return Parallel(connection, ()) - Parallel(connection, layers) + layers = NamedTuple(kw) + if :layers in keys(layers) || :connection in keys(layers) + throw(ArgumentError("a Parallel layer cannot have a named sub-layer called `connection` or `layers`")) + end + isempty(layers) && return Parallel(connection, ()) + return Parallel(connection, layers) end @functor Parallel @@ -513,30 +532,31 @@ end (m::Parallel)(xs::Tuple) = m(xs...) function _parallel_check(layers, xs) - nl = length(layers) - nx = length(xs) - if (nl != nx) - throw(ArgumentError("Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs")) - end + nl = length(layers) + nx = length(xs) + if (nl != nx) + throw(ArgumentError("Parallel with $nl sub-layers can take one input or $nl inputs, but got $nx inputs")) + end end ChainRulesCore.@non_differentiable _parallel_check(nl, nx) function (m::Parallel)(xs...) - _parallel_check(m.layers, xs) - m.connection(map(|>, xs, Tuple(m.layers))...) + _parallel_check(m.layers, xs) + return m.connection(map(|>, xs, Tuple(m.layers))...) end Base.getindex(m::Parallel, i) = m.layers[i] Base.getindex(m::Parallel, i::AbstractVector) = Parallel(m.connection, m.layers[i]) -Base.getindex(m::Parallel{<:Any, <:NamedTuple}, i::AbstractVector) = - Parallel(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) +function Base.getindex(m::Parallel{<:Any, <:NamedTuple}, i::AbstractVector) + return Parallel(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) +end Base.keys(m::Parallel) = keys(getfield(m, :layers)) function Base.show(io::IO, m::Parallel) - print(io, "Parallel(", m.connection, ", ") - _show_layers(io, m.layers) - print(io, ")") + print(io, "Parallel(", m.connection, ", ") + _show_layers(io, m.layers) + return print(io, ")") end """ @@ -544,37 +564,40 @@ end ## Arguments -- `connection`: A function taking 2 inputs and combining them into a single output -- `layers`: The layers whose outputs are combined + - `connection`: A function taking 2 inputs and combining them into a single output + - `layers`: The layers whose outputs are combined ## Inputs This layer behaves differently based on input type: -1. If input `x` is a tuple of length N (or the input is `xs` with N `x`'s), matching the number of `layers`, - then each layer receives a new input `x[i]` combined with the previous output `y[i-1]` using `connection`. - Thus `(y1, y2, y3) = PairwiseFusion(connection, layer1, layer2, layer3)((x1, x2, x3))` - may be drawn as: + 1. If input `x` is a tuple of length N (or the input is `xs` with N `x`'s), matching the number of `layers`, + then each layer receives a new input `x[i]` combined with the previous output `y[i-1]` using `connection`. + Thus `(y1, y2, y3) = PairwiseFusion(connection, layer1, layer2, layer3)((x1, x2, x3))` + may be drawn as: + ``` x1 → layer1 → y1 ↘ connection → layer2 → y2 ↘ x2 ↗ connection → layer3 → y3 x3 ↗ ``` + ... or written as: + ```julia y1 = layer1(x1) y2 = layer2(connection(x2, y1)) y3 = layer3(connection(x3, y2)) ``` -2. With just one input, each layer receives the same `x` combined with the previous output. - Thus `y = PairwiseFusion(connection, layers...)(x)` obeys: + 2. With just one input, each layer receives the same `x` combined with the previous output. + Thus `y = PairwiseFusion(connection, layers...)(x)` obeys: ```julia y[1] == layers[1](x) for i in 2:length(layers) - y[i] == connection(x, layers[i](y[i-1])) + y[i] == connection(x, layers[i](y[i - 1])) end ``` @@ -582,78 +605,86 @@ end A tuple of length N with the output of each fusion ((`y1`, `y2`, ..., `yN`) in the example above). """ -struct PairwiseFusion{F, T<:Union{Tuple, NamedTuple}} - connection::F - layers::T +struct PairwiseFusion{F, T <: Union{Tuple, NamedTuple}} + connection::F + layers::T end PairwiseFusion(connection, layers...) = PairwiseFusion(connection, layers) function PairwiseFusion(connection; kw...) - layers = NamedTuple(kw) - if :layers in keys(layers) || :connection in keys(layers) - throw(ArgumentError("a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`")) - end - isempty(layers) && return PairwiseFusion(connection, ()) - PairwiseFusion(connection, layers) + layers = NamedTuple(kw) + if :layers in keys(layers) || :connection in keys(layers) + throw(ArgumentError("a PairwiseFusion layer cannot have a named sub-layer called `connection` or `layers`")) + end + isempty(layers) && return PairwiseFusion(connection, ()) + return PairwiseFusion(connection, layers) end function _pairwise_check(x, layers, T) - lx = length(x) - N = length(layers) - if T <: Tuple && lx != N - throw(ArgumentError("PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs")) - end + lx = length(x) + N = length(layers) + if T <: Tuple && lx != N + throw(ArgumentError("PairwiseFusion with $N sub-layers can take one input or $N inputs, but got $lx inputs")) + end end ChainRulesCore.@non_differentiable _pairwise_check(lx, N, T) function (m::PairwiseFusion)(x::T) where {T} - _pairwise_check(x, m.layers, T) - applypairwisefusion(m.layers, m.connection, x) + _pairwise_check(x, m.layers, T) + return applypairwisefusion(m.layers, m.connection, x) end (m::PairwiseFusion)(xs...) = m(xs) -@generated function applypairwisefusion(layers::Tuple{Vararg{<:Any,N}}, connection, x::T) where {N, T} - y_symbols = [gensym() for _ in 1:(N + 1)] - getinput(i) = T <: Tuple ? :(x[$i]) : :x - calls = [:($(y_symbols[N + 1]) = $(getinput(1)))] - for i in 1:N - 1 - push!(calls, quote - $(y_symbols[i]) = layers[$i]($(y_symbols[N + 1])) - $(y_symbols[N + 1]) = connection($(y_symbols[i]), $(getinput(i + 1))) - end) - end - push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N + 1])))) - push!(calls, :(return tuple($(Tuple(y_symbols[1:N])...)))) - return Expr(:block, calls...) -end -applypairwisefusion(layers::NamedTuple, connection, x) = applypairwisefusion(Tuple(layers), connection, x) +@generated function applypairwisefusion(layers::Tuple{Vararg{<:Any, N}}, connection, + x::T) where {N, T} + y_symbols = [gensym() for _ in 1:(N + 1)] + getinput(i) = T <: Tuple ? :(x[$i]) : :x + calls = [:($(y_symbols[N + 1]) = $(getinput(1)))] + for i in 1:(N - 1) + push!(calls, + quote + $(y_symbols[i]) = layers[$i]($(y_symbols[N + 1])) + $(y_symbols[N + 1]) = connection($(y_symbols[i]), $(getinput(i + 1))) + end) + end + push!(calls, :($(y_symbols[N]) = layers[$N]($(y_symbols[N + 1])))) + push!(calls, :(return tuple($(Tuple(y_symbols[1:N])...)))) + return Expr(:block, calls...) +end +function applypairwisefusion(layers::NamedTuple, connection, x) + return applypairwisefusion(Tuple(layers), connection, x) +end @functor PairwiseFusion Base.getindex(m::PairwiseFusion, i) = m.layers[i] -Base.getindex(m::PairwiseFusion, i::AbstractVector) = PairwiseFusion(m.connection, m.layers[i]) -Base.getindex(m::PairwiseFusion{<:Any, <:NamedTuple}, i::AbstractVector) = - PairwiseFusion(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) +function Base.getindex(m::PairwiseFusion, i::AbstractVector) + return PairwiseFusion(m.connection, m.layers[i]) +end +function Base.getindex(m::PairwiseFusion{<:Any, <:NamedTuple}, i::AbstractVector) + return PairwiseFusion(m.connection, NamedTuple{keys(m)[i]}(Tuple(m.layers)[i])) +end Base.keys(m::PairwiseFusion) = keys(getfield(m, :layers)) function Base.show(io::IO, m::PairwiseFusion) - print(io, "PairwiseFusion(", m.connection, ", ") - _show_layers(io, m.layers) - print(io, ")") + print(io, "PairwiseFusion(", m.connection, ", ") + _show_layers(io, m.layers) + return print(io, ")") end """ Embedding(in => out; init=randn) -A lookup table that stores embeddings of dimension `out` +A lookup table that stores embeddings of dimension `out` for a vocabulary of size `in`. -This layer is often used to store word embeddings and retrieve them using indices. +This layer is often used to store word embeddings and retrieve them using indices. The input to the layer can be either a vector of indexes -or the corresponding [`onehot encoding`](@ref OneHotArrays.onehotbatch). +or the corresponding [`onehot encoding`](@ref OneHotArrays.onehotbatch). # Examples + ```jldoctest julia> vocab_size, embed_size = 1000, 4; @@ -662,7 +693,8 @@ Embedding(1000 => 4) # 4_000 parameters julia> vocab_idxs = [1, 722, 53, 220, 3]; -julia> x = Flux.onehotbatch(vocab_idxs, 1:vocab_size); summary(x) +julia> x = Flux.onehotbatch(vocab_idxs, 1:vocab_size); + summary(x); "1000×5 OneHotMatrix(::Vector{UInt32}) with eltype Bool" julia> model(x) |> summary @@ -673,7 +705,7 @@ true ``` """ struct Embedding{W} - weight::W + weight::W end @functor Embedding @@ -684,11 +716,12 @@ Embedding((in, out)::Pair{<:Integer, <:Integer}; init = randn32) = Embedding(ini (m::Embedding)(x::AbstractVector) = NNlib.gather(m.weight, x) (m::Embedding)(x::AbstractArray) = reshape(m(vec(x)), :, size(x)...) -function (m::Embedding)(x::Union{OneHotVector{T,L}, OneHotMatrix{T,L}}) where {T,L} - size(m.weight, 2) == L || throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L")) - return m(onecold(x)) +function (m::Embedding)(x::Union{OneHotVector{T, L}, OneHotMatrix{T, L}}) where {T, L} + size(m.weight, 2) == L || + throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L")) + return m(onecold(x)) end function Base.show(io::IO, m::Embedding) - print(io, "Embedding(", size(m.weight, 2), " => ", size(m.weight, 1), ")") + return print(io, "Embedding(", size(m.weight, 2), " => ", size(m.weight, 1), ")") end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 003395c15d..428f460dd1 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -8,7 +8,9 @@ expand(N, i::Integer) = ntuple(_ -> i, N) conv_reshape_bias(c) = conv_reshape_bias(c.bias, c.stride) conv_reshape_bias(@nospecialize(bias), _) = bias -conv_reshape_bias(bias::AbstractVector, stride) = reshape(bias, map(_->1, stride)..., :, 1) +function conv_reshape_bias(bias::AbstractVector, stride) + return reshape(bias, map(_ -> 1, stride)..., :, 1) +end """ SamePad() @@ -21,22 +23,23 @@ When `stride≠1`, the output size equals `ceil(input_size/stride)`. See also [`Conv`](@ref), [`MaxPool`](@ref). # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of images -julia> layer = Conv((2,2), 3 => 7, pad=SamePad()) +julia> layer = Conv((2, 2), 3 => 7, pad = SamePad()) Conv((2, 2), 3 => 7, pad=(1, 0, 1, 0)) # 91 parameters julia> layer(xs) |> size # notice how the dimensions stay the same with this padding (100, 100, 7, 50) -julia> layer2 = Conv((2,2), 3 => 7) +julia> layer2 = Conv((2, 2), 3 => 7) Conv((2, 2), 3 => 7) # 91 parameters julia> layer2(xs) |> size # the output dimension changes as the padding was not "same" (99, 99, 7, 50) -julia> layer3 = Conv((5, 5), 3 => 7, stride=2, pad=SamePad()) +julia> layer3 = Conv((5, 5), 3 => 7, stride = 2, pad = SamePad()) Conv((5, 5), 3 => 7, pad=2, stride=2) # 532 parameters julia> layer3(xs) |> size # output size = `ceil(input_size/stride)` = 50 @@ -45,16 +48,18 @@ julia> layer3(xs) |> size # output size = `ceil(input_size/stride)` = 50 """ struct SamePad end -calc_padding(lt, pad, k::NTuple{N,T}, dilation, stride) where {T,N} = expand(Val(2*N), pad) -function calc_padding(lt, ::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T} - #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/abs/1603.07285 +function calc_padding(lt, pad, k::NTuple{N, T}, dilation, stride) where {T, N} + return expand(Val(2 * N), pad) +end +function calc_padding(lt, ::SamePad, k::NTuple{N, T}, dilation, stride) where {N, T} + #Ref: "A guide to convolution arithmetic for deep learning" https://arxiv.org/abs/1603.07285 - # Effective kernel size, including dilation - k_eff = @. k + (k - 1) * (dilation - 1) - # How much total padding needs to be applied? - pad_amt = @. k_eff - 1 - # In case amount of padding is odd we need to apply different amounts to each side. - return Tuple(mapfoldl(i -> [cld(i, 2), fld(i,2)], vcat, pad_amt)) + # Effective kernel size, including dilation + k_eff = @. k + (k - 1) * (dilation - 1) + # How much total padding needs to be applied? + pad_amt = @. k_eff - 1 + # In case amount of padding is odd we need to apply different amounts to each side. + return Tuple(mapfoldl(i -> [cld(i, 2), fld(i, 2)], vcat, pad_amt)) end """ @@ -75,56 +80,61 @@ To take convolutions along `N` feature dimensions, this layer expects as input a with `ndims(x) == N+2`, where `size(x, N+1) == in` is the number of input channels, and `size(x, ndims(x))` is (as always) the number of observations in a batch. Then: -* `filter` should be a tuple of `N` integers. -* Keywords `stride` and `dilation` should each be either single integer, - or a tuple with `N` integers. -* Keyword `pad` specifies the number of elements added to the borders of the data array. It can be - - a single integer for equal padding all around, - - a tuple of `N` integers, to apply the same padding at begin/end of each spatial dimension, - - a tuple of `2*N` integers, for asymmetric padding, or - - the singleton `SamePad()`, to calculate padding such that - `size(output,d) == size(x,d) / stride` (possibly rounded) for each spatial dimension. -* Keyword `groups` is expected to be an `Int`. It specifies the number of groups - to divide a convolution into. + + - `filter` should be a tuple of `N` integers. + + - Keywords `stride` and `dilation` should each be either single integer, + or a tuple with `N` integers. + - Keyword `pad` specifies the number of elements added to the borders of the data array. It can be + + + a single integer for equal padding all around, + + a tuple of `N` integers, to apply the same padding at begin/end of each spatial dimension, + + a tuple of `2*N` integers, for asymmetric padding, or + + the singleton `SamePad()`, to calculate padding such that + `size(output,d) == size(x,d) / stride` (possibly rounded) for each spatial dimension. + - Keyword `groups` is expected to be an `Int`. It specifies the number of groups + to divide a convolution into. Keywords to control initialization of the layer: -* `init` - Function used to generate initial weights. Defaults to `glorot_uniform`. -* `bias` - The initial bias vector is all zero by default. Trainable bias can be disabled entirely - by setting this to `false`, or another vector can be provided such as `bias = randn(Float32, out)`. + + - `init` - Function used to generate initial weights. Defaults to `glorot_uniform`. + - `bias` - The initial bias vector is all zero by default. Trainable bias can be disabled entirely + by setting this to `false`, or another vector can be provided such as `bias = randn(Float32, out)`. See also [`ConvTranspose`](@ref), [`DepthwiseConv`](@ref), [`CrossCor`](@ref). # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of images -julia> layer = Conv((5,5), 3 => 7, relu; bias = false) +julia> layer = Conv((5, 5), 3 => 7, relu; bias = false) Conv((5, 5), 3 => 7, relu, bias=false) # 525 parameters julia> layer(xs) |> size (96, 96, 7, 50) -julia> Conv((5,5), 3 => 7; stride = 2)(xs) |> size +julia> Conv((5, 5), 3 => 7; stride = 2)(xs) |> size (48, 48, 7, 50) -julia> Conv((5,5), 3 => 7; stride = 2, pad = SamePad())(xs) |> size +julia> Conv((5, 5), 3 => 7; stride = 2, pad = SamePad())(xs) |> size (50, 50, 7, 50) -julia> Conv((1,1), 3 => 7; pad = (20,10,0,0))(xs) |> size +julia> Conv((1, 1), 3 => 7; pad = (20, 10, 0, 0))(xs) |> size (130, 100, 7, 50) -julia> Conv((5,5), 3 => 7; stride = 2, dilation = 4)(xs) |> size +julia> Conv((5, 5), 3 => 7; stride = 2, dilation = 4)(xs) |> size (42, 42, 7, 50) ``` """ -struct Conv{N,M,F,A,V} - σ::F - weight::A - bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} - groups::Int +struct Conv{N, M, F, A, V} + σ::F + weight::A + bias::V + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} + groups::Int end """ @@ -149,23 +159,21 @@ julia> Flux.params(layer) |> length 2 ``` """ -function Conv(w::AbstractArray{T,N}, b = true, σ = identity; - stride = 1, pad = 0, dilation = 1, groups = 1) where {T,N} - - @assert size(w, N) % groups == 0 "Output channel dimension must be divisible by groups." - stride = expand(Val(N-2), stride) - dilation = expand(Val(N-2), dilation) - pad = calc_padding(Conv, pad, size(w)[1:N-2], dilation, stride) - bias = create_bias(w, b, size(w, N)) - return Conv(σ, w, bias, stride, pad, dilation, groups) +function Conv(w::AbstractArray{T, N}, b = true, σ = identity; + stride = 1, pad = 0, dilation = 1, groups = 1) where {T, N} + @assert size(w, N) % groups==0 "Output channel dimension must be divisible by groups." + stride = expand(Val(N - 2), stride) + dilation = expand(Val(N - 2), dilation) + pad = calc_padding(Conv, pad, size(w)[1:(N - 2)], dilation, stride) + bias = create_bias(w, b, size(w, N)) + return Conv(σ, w, bias, stride, pad, dilation, groups) end -function Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1, - bias = true) where N - - weight = convfilter(k, ch; init, groups) - Conv(weight, bias, σ; stride, pad, dilation, groups) +function Conv(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, groups = 1, + bias = true) where {N} + weight = convfilter(k, ch; init, groups) + return Conv(weight, bias, σ; stride, pad, dilation, groups) end """ @@ -179,46 +187,48 @@ distribution. This is internally used by the [`Conv`](@ref) layer. """ -function convfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; - init = glorot_uniform, groups = 1) where N - cin, cout = ch - @assert cin % groups == 0 "Input channel dimension must be divisible by groups." - @assert cout % groups == 0 "Output channel dimension must be divisible by groups." - init(filter..., cin÷groups, cout) +function convfilter(filter::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}; + init = glorot_uniform, groups = 1) where {N} + cin, cout = ch + @assert cin % groups==0 "Input channel dimension must be divisible by groups." + @assert cout % groups==0 "Output channel dimension must be divisible by groups." + return init(filter..., cin ÷ groups, cout) end @functor Conv -conv_dims(c::Conv, x::AbstractArray) = - DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, dilation = c.dilation, groups = c.groups) +function conv_dims(c::Conv, x::AbstractArray) + return DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, + dilation = c.dilation, groups = c.groups) +end ChainRulesCore.@non_differentiable conv_dims(::Any, ::Any) function (c::Conv)(x::AbstractArray) - σ = NNlib.fast_act(c.σ, x) - cdims = conv_dims(c, x) - σ.(conv(x, c.weight, cdims) .+ conv_reshape_bias(c)) + σ = NNlib.fast_act(c.σ, x) + cdims = conv_dims(c, x) + return σ.(conv(x, c.weight, cdims) .+ conv_reshape_bias(c)) end -_channels_in(l::Conv) = size(l.weight, ndims(l.weight)-1) * l.groups +_channels_in(l::Conv) = size(l.weight, ndims(l.weight) - 1) * l.groups _channels_out(l::Conv) = size(l.weight, ndims(l.weight)) function Base.show(io::IO, l::Conv) - print(io, "Conv(", size(l.weight)[1:ndims(l.weight)-2]) - print(io, ", ", _channels_in(l), " => ", _channels_out(l)) - _print_conv_opt(io, l) - print(io, ")") + print(io, "Conv(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, ", ", _channels_in(l), " => ", _channels_out(l)) + _print_conv_opt(io, l) + return print(io, ")") end function _print_conv_opt(io::IO, l) - l.σ == identity || print(io, ", ", l.σ) - all(==(0), l.pad) || print(io, ", pad=", _maybetuple_string(l.pad)) - all(==(1), l.stride) || print(io, ", stride=", _maybetuple_string(l.stride)) - all(==(1), l.dilation) || print(io, ", dilation=", _maybetuple_string(l.dilation)) - if hasproperty(l, :groups) - (l.groups == 1) || print(io, ", groups=", l.groups) - end - (l.bias === false) && print(io, ", bias=false") + l.σ == identity || print(io, ", ", l.σ) + all(==(0), l.pad) || print(io, ", pad=", _maybetuple_string(l.pad)) + all(==(1), l.stride) || print(io, ", stride=", _maybetuple_string(l.stride)) + all(==(1), l.dilation) || print(io, ", dilation=", _maybetuple_string(l.dilation)) + if hasproperty(l, :groups) + (l.groups == 1) || print(io, ", groups=", l.groups) + end + return (l.bias === false) && print(io, ", bias=false") end """ @@ -236,34 +246,35 @@ Parameters are controlled by additional keywords, with defaults See also [`Conv`](@ref) for more detailed description of keywords. # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of 50 RGB images -julia> layer = ConvTranspose((5,5), 3 => 7, relu) +julia> layer = ConvTranspose((5, 5), 3 => 7, relu) ConvTranspose((5, 5), 3 => 7, relu) # 532 parameters julia> layer(xs) |> size (104, 104, 7, 50) -julia> ConvTranspose((5,5), 3 => 7, stride=2)(xs) |> size +julia> ConvTranspose((5, 5), 3 => 7, stride = 2)(xs) |> size (203, 203, 7, 50) -julia> ConvTranspose((5,5), 3 => 7, stride=3, pad=SamePad())(xs) |> size +julia> ConvTranspose((5, 5), 3 => 7, stride = 3, pad = SamePad())(xs) |> size (300, 300, 7, 50) ``` """ -struct ConvTranspose{N,M,F,A,V} - σ::F - weight::A - bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} - groups::Int +struct ConvTranspose{N, M, F, A, V} + σ::F + weight::A + bias::V + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} + groups::Int end -_channels_in(l::ConvTranspose) = size(l.weight)[end] -_channels_out(l::ConvTranspose) = size(l.weight)[end-1]*l.groups +_channels_in(l::ConvTranspose) = size(l.weight)[end] +_channels_out(l::ConvTranspose) = size(l.weight)[end - 1] * l.groups """ ConvTranspose(weight::AbstractArray, [bias, activation; stride, pad, dilation, groups]) @@ -273,6 +284,7 @@ Accepts the same keywords and has the same defaults as [`ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ; ...)`](@ref ConvTranspose). # Examples + ```jldoctest julia> weight = rand(3, 4, 5); @@ -288,66 +300,65 @@ julia> Flux.params(layer) |> length 2 ``` """ -function ConvTranspose(w::AbstractArray{T,N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1, groups=1) where {T,N} - stride = expand(Val(N-2), stride) - dilation = expand(Val(N-2), dilation) - pad = calc_padding(ConvTranspose, pad, size(w)[1:N-2], dilation, stride) - b = create_bias(w, bias, size(w, N-1) * groups) - return ConvTranspose(σ, w, b, stride, pad, dilation, groups) +function ConvTranspose(w::AbstractArray{T, N}, bias = true, σ = identity; + stride = 1, pad = 0, dilation = 1, groups = 1) where {T, N} + stride = expand(Val(N - 2), stride) + dilation = expand(Val(N - 2), dilation) + pad = calc_padding(ConvTranspose, pad, size(w)[1:(N - 2)], dilation, stride) + b = create_bias(w, bias, size(w, N - 1) * groups) + return ConvTranspose(σ, w, b, stride, pad, dilation, groups) end -function ConvTranspose(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - groups = 1, - bias = true, - ) where N - - weight = convfilter(k, reverse(ch); init, groups) - ConvTranspose(weight, bias, σ; stride, pad, dilation, groups) +function ConvTranspose(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; + init = glorot_uniform, stride = 1, pad = 0, dilation = 1, + groups = 1, + bias = true) where {N} + weight = convfilter(k, reverse(ch); init, groups) + return ConvTranspose(weight, bias, σ; stride, pad, dilation, groups) end @functor ConvTranspose function conv_transpose_dims(c::ConvTranspose, x::AbstractArray) - # Calculate size of "input", from ∇conv_data()'s perspective... - combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end]) - I = (size(x)[1:end-2] .- 1).*c.stride .+ 1 .+ (size(c.weight)[1:end-2] .- 1).*c.dilation .- combined_pad - C_in = size(c.weight)[end-1] * c.groups - batch_size = size(x)[end] - # Create DenseConvDims() that looks like the corresponding conv() - w_size = size(c.weight) - return DenseConvDims((I..., C_in, batch_size), w_size; - stride=c.stride, - padding=c.pad, - dilation=c.dilation, - groups=c.groups, - ) + # Calculate size of "input", from ∇conv_data()'s perspective... + combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end]) + I = (size(x)[1:(end - 2)] .- 1) .* c.stride .+ 1 .+ + (size(c.weight)[1:(end - 2)] .- 1) .* c.dilation .- combined_pad + C_in = size(c.weight)[end - 1] * c.groups + batch_size = size(x)[end] + # Create DenseConvDims() that looks like the corresponding conv() + w_size = size(c.weight) + return DenseConvDims((I..., C_in, batch_size), w_size; + stride = c.stride, + padding = c.pad, + dilation = c.dilation, + groups = c.groups) end ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any) function (c::ConvTranspose)(x::AbstractArray) - σ = NNlib.fast_act(c.σ, x) - cdims = conv_transpose_dims(c, x) - σ.(∇conv_data(x, c.weight, cdims) .+ conv_reshape_bias(c)) + σ = NNlib.fast_act(c.σ, x) + cdims = conv_transpose_dims(c, x) + return σ.(∇conv_data(x, c.weight, cdims) .+ conv_reshape_bias(c)) end function Base.show(io::IO, l::ConvTranspose) - print(io, "ConvTranspose(", size(l.weight)[1:ndims(l.weight)-2]) - print(io, ", ", _channels_in(l), " => ", _channels_out(l)) - _print_conv_opt(io, l) - print(io, ")") + print(io, "ConvTranspose(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, ", ", _channels_in(l), " => ", _channels_out(l)) + _print_conv_opt(io, l) + return print(io, ")") end -function calc_padding(::Type{ConvTranspose}, pad::SamePad, k::NTuple{N,T}, dilation, stride) where {N,T} - calc_padding(Conv, pad, k .- stride .+ 1, dilation, stride) +function calc_padding(::Type{ConvTranspose}, pad::SamePad, k::NTuple{N, T}, dilation, + stride) where {N, T} + return calc_padding(Conv, pad, k .- stride .+ 1, dilation, stride) end """ DepthwiseConv(filter, in => out, σ=identity; stride=1, pad=0, dilation=1, [bias, init]) DepthwiseConv(weight::AbstractArray, [bias, activation; stride, pad, dilation]) - + Return a depthwise convolutional layer, that is a [`Conv`](@ref) layer with number of groups equal to the number of input channels. @@ -358,28 +369,29 @@ See [`Conv`](@ref) for a description of the arguments. ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of 50 RGB images -julia> layer = DepthwiseConv((5,5), 3 => 6, relu; bias=false) -Conv((5, 5), 3 => 6, relu, groups=3, bias=false) # 150 parameters +julia> layer = DepthwiseConv((5, 5), 3 => 6, relu; bias = false) +Conv((5, 5), 3 => 6, relu, groups=3, bias=false) # 150 parameters julia> layer(xs) |> size (96, 96, 6, 50) -julia> DepthwiseConv((5, 5), 3 => 9, stride=2, pad=2)(xs) |> size +julia> DepthwiseConv((5, 5), 3 => 9, stride = 2, pad = 2)(xs) |> size (50, 50, 9, 50) ``` """ -function DepthwiseConv(k::NTuple{<:Any,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; - stride = 1, pad = 0, dilation = 1, bias = true, init = glorot_uniform) - Conv(k, ch, σ; groups=ch.first, stride, pad, dilation, bias, init) +function DepthwiseConv(k::NTuple{<:Any, Integer}, ch::Pair{<:Integer, <:Integer}, + σ = identity; + stride = 1, pad = 0, dilation = 1, bias = true, + init = glorot_uniform) + return Conv(k, ch, σ; groups = ch.first, stride, pad, dilation, bias, init) end -function DepthwiseConv(w::AbstractArray{T,N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} - w2 = reshape(w, size(w)[1:end-2]..., 1, :) - Conv(w2, bias, σ; groups = size(w)[end-1], stride, pad, dilation) +function DepthwiseConv(w::AbstractArray{T, N}, bias = true, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T, N} + w2 = reshape(w, size(w)[1:(end - 2)]..., 1, :) + return Conv(w2, bias, σ; groups = size(w)[end - 1], stride, pad, dilation) end - """ CrossCor(filter, in => out, σ=identity; stride=1, pad=0, dilation=1, [bias, init]) @@ -397,23 +409,23 @@ See also [`Conv`](@ref) for more detailed description of keywords. ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # a batch of 50 RGB images -julia> layer = CrossCor((5,5), 3 => 6, relu; bias=false) +julia> layer = CrossCor((5, 5), 3 => 6, relu; bias = false) CrossCor((5, 5), 3 => 6, relu, bias=false) # 450 parameters julia> layer(xs) |> size (96, 96, 6, 50) -julia> CrossCor((5,5), 3 => 7, stride=3, pad=(2,0))(xs) |> size +julia> CrossCor((5, 5), 3 => 7, stride = 3, pad = (2, 0))(xs) |> size (34, 32, 7, 50) ``` """ -struct CrossCor{N,M,F,A,V} - σ::F - weight::A - bias::V - stride::NTuple{N,Int} - pad::NTuple{M,Int} - dilation::NTuple{N,Int} +struct CrossCor{N, M, F, A, V} + σ::F + weight::A + bias::V + stride::NTuple{N, Int} + pad::NTuple{M, Int} + dilation::NTuple{N, Int} end """ @@ -424,6 +436,7 @@ Accepts the same keywords and has the same defaults as [`CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ; ...)`](@ref CrossCor). # Examples + ```jldoctest julia> weight = rand(3, 4, 5); @@ -436,46 +449,48 @@ julia> layer(randn(100, 4, 64)) |> size (98, 5, 64) ``` """ -function CrossCor(w::AbstractArray{T,N}, bias = true, σ = identity; - stride = 1, pad = 0, dilation = 1) where {T,N} - stride = expand(Val(N-2), stride) - dilation = expand(Val(N-2), dilation) - pad = calc_padding(CrossCor, pad, size(w)[1:N-2], dilation, stride) - b = create_bias(w, bias, size(w, N)) - return CrossCor(σ, w, b, stride, pad, dilation) +function CrossCor(w::AbstractArray{T, N}, bias = true, σ = identity; + stride = 1, pad = 0, dilation = 1) where {T, N} + stride = expand(Val(N - 2), stride) + dilation = expand(Val(N - 2), dilation) + pad = calc_padding(CrossCor, pad, size(w)[1:(N - 2)], dilation, stride) + b = create_bias(w, bias, size(w, N)) + return CrossCor(σ, w, b, stride, pad, dilation) end -function CrossCor(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity; +function CrossCor(k::NTuple{N, Integer}, ch::Pair{<:Integer, <:Integer}, σ = identity; init = glorot_uniform, stride = 1, pad = 0, dilation = 1, - bias = true) where N - - weight = convfilter(k, ch, init = init) - return CrossCor(weight, bias, σ; stride, pad, dilation) + bias = true) where {N} + weight = convfilter(k, ch, init = init) + return CrossCor(weight, bias, σ; stride, pad, dilation) end @functor CrossCor function crosscor(x, w, ddims::DenseConvDims) - ddims = DenseConvDims(ddims, F=true) - return conv(x, w, ddims) + ddims = DenseConvDims(ddims, F = true) + return conv(x, w, ddims) end -crosscor_dims(c::CrossCor, x::AbstractArray) = - DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, dilation = c.dilation) +function crosscor_dims(c::CrossCor, x::AbstractArray) + return DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, + dilation = c.dilation) +end ChainRulesCore.@non_differentiable crosscor_dims(::Any, ::Any) function (c::CrossCor)(x::AbstractArray) - σ = NNlib.fast_act(c.σ, x) - cdims = crosscor_dims(c, x) - σ.(crosscor(x, c.weight, cdims) .+ conv_reshape_bias(c)) + σ = NNlib.fast_act(c.σ, x) + cdims = crosscor_dims(c, x) + return σ.(crosscor(x, c.weight, cdims) .+ conv_reshape_bias(c)) end function Base.show(io::IO, l::CrossCor) - print(io, "CrossCor(", size(l.weight)[1:ndims(l.weight)-2]) - print(io, ", ", size(l.weight, ndims(l.weight)-1), " => ", size(l.weight, ndims(l.weight))) - _print_conv_opt(io, l) - print(io, ")") + print(io, "CrossCor(", size(l.weight)[1:(ndims(l.weight) - 2)]) + print(io, ", ", size(l.weight, ndims(l.weight) - 1), " => ", + size(l.weight, ndims(l.weight))) + _print_conv_opt(io, l) + return print(io, ")") end """ @@ -490,33 +505,34 @@ batch dimensions, after the `N` feature dimensions, where `N = length(out)`. See also [`MaxPool`](@ref), [`AdaptiveMeanPool`](@ref). # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # batch of 50 RGB images julia> AdaptiveMaxPool((25, 25))(xs) |> size (25, 25, 3, 50) -julia> MaxPool((4,4))(xs) ≈ AdaptiveMaxPool((25, 25))(xs) +julia> MaxPool((4, 4))(xs) ≈ AdaptiveMaxPool((25, 25))(xs) true ``` """ struct AdaptiveMaxPool{S, O} - out::NTuple{O, Int} - AdaptiveMaxPool(out::NTuple{O, Int}) where O = new{O + 2, O}(out) + out::NTuple{O, Int} + AdaptiveMaxPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) end function (a::AdaptiveMaxPool{S})(x::AbstractArray{T, S}) where {S, T} - insize = size(x)[1:end-2] - outsize = a.out - stride = insize .÷ outsize - k = insize .- (outsize .- 1) .* stride - pad = 0 - pdims = PoolDims(x, k; padding=pad, stride=stride) - return maxpool(x, pdims) + insize = size(x)[1:(end - 2)] + outsize = a.out + stride = insize .÷ outsize + k = insize .- (outsize .- 1) .* stride + pad = 0 + pdims = PoolDims(x, k; padding = pad, stride = stride) + return maxpool(x, pdims) end function Base.show(io::IO, a::AdaptiveMaxPool) - print(io, "AdaptiveMaxPool(", a.out, ")") + return print(io, "AdaptiveMaxPool(", a.out, ")") end """ @@ -531,33 +547,34 @@ batch dimensions, after the `N` feature dimensions, where `N = length(out)`. See also [`MaxPool`](@ref), [`AdaptiveMaxPool`](@ref). # Examples + ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # batch of 50 RGB images julia> AdaptiveMeanPool((25, 25))(xs) |> size (25, 25, 3, 50) -julia> MeanPool((4,4))(xs) ≈ AdaptiveMeanPool((25, 25))(xs) +julia> MeanPool((4, 4))(xs) ≈ AdaptiveMeanPool((25, 25))(xs) true ``` """ struct AdaptiveMeanPool{S, O} - out::NTuple{O, Int} - AdaptiveMeanPool(out::NTuple{O, Int}) where O = new{O + 2, O}(out) + out::NTuple{O, Int} + AdaptiveMeanPool(out::NTuple{O, Int}) where {O} = new{O + 2, O}(out) end function (a::AdaptiveMeanPool{S})(x::AbstractArray{T, S}) where {S, T} - insize = size(x)[1:end-2] - outsize = a.out - stride = insize .÷ outsize - k = insize .- (outsize .- 1) .* stride - pad = 0 - pdims = PoolDims(x, k; padding=pad, stride=stride) - return meanpool(x, pdims) + insize = size(x)[1:(end - 2)] + outsize = a.out + stride = insize .÷ outsize + k = insize .- (outsize .- 1) .* stride + pad = 0 + pdims = PoolDims(x, k; padding = pad, stride = stride) + return meanpool(x, pdims) end function Base.show(io::IO, a::AdaptiveMeanPool) - print(io, "AdaptiveMeanPool(", a.out, ")") + return print(io, "AdaptiveMeanPool(", a.out, ")") end """ @@ -573,30 +590,30 @@ See also [`MaxPool`](@ref), [`GlobalMeanPool`](@ref). ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); -julia> m = Chain(Conv((3,3), 3 => 7), GlobalMaxPool()); +julia> m = Chain(Conv((3, 3), 3 => 7), GlobalMaxPool()); julia> m(xs) |> size (1, 1, 7, 50) -julia> GlobalMaxPool()(rand(3,5,7)) |> size # preserves 2 dimensions +julia> GlobalMaxPool()(rand(3, 5, 7)) |> size # preserves 2 dimensions (1, 5, 7) ``` """ struct GlobalMaxPool end function (g::GlobalMaxPool)(x) - # Input size - x_size = size(x) - # Kernel size - k = x_size[1:end-2] - # Pooling dimensions - pdims = PoolDims(x, k) + # Input size + x_size = size(x) + # Kernel size + k = x_size[1:(end - 2)] + # Pooling dimensions + pdims = PoolDims(x, k) - return maxpool(x, pdims) + return maxpool(x, pdims) end function Base.show(io::IO, g::GlobalMaxPool) - print(io, "GlobalMaxPool()") + return print(io, "GlobalMaxPool()") end """ @@ -610,7 +627,7 @@ by performing mean pooling on the complete (w,h)-shaped feature maps. ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); -julia> m = Chain(Conv((3,3), 3 => 7), GlobalMeanPool()); +julia> m = Chain(Conv((3, 3), 3 => 7), GlobalMeanPool()); julia> m(xs) |> size (1, 1, 7, 50) @@ -619,18 +636,18 @@ julia> m(xs) |> size struct GlobalMeanPool end function (g::GlobalMeanPool)(x) - # Input size - x_size = size(x) - # Kernel size - k = x_size[1:end-2] - # Pooling dimensions - pdims = PoolDims(x, k) + # Input size + x_size = size(x) + # Kernel size + k = x_size[1:(end - 2)] + # Pooling dimensions + pdims = PoolDims(x, k) - return meanpool(x, pdims) + return meanpool(x, pdims) end function Base.show(io::IO, g::GlobalMeanPool) - print(io, "GlobalMeanPool()") + return print(io, "GlobalMeanPool()") end """ @@ -653,7 +670,7 @@ See also [`Conv`](@ref), [`MeanPool`](@ref), [`AdaptiveMaxPool`](@ref), [`Global ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); # batch of 50 RGB images -julia> m = Chain(Conv((5, 5), 3 => 7, pad=SamePad()), MaxPool((5, 5), pad=SamePad())) +julia> m = Chain(Conv((5, 5), 3 => 7, pad = SamePad()), MaxPool((5, 5), pad = SamePad())) Chain( Conv((5, 5), 3 => 7, pad=2), # 532 parameters MaxPool((5, 5), pad=2), @@ -665,39 +682,39 @@ julia> m[1](xs) |> size julia> m(xs) |> size (20, 20, 7, 50) -julia> layer = MaxPool((5,), pad=2, stride=(3,)) # one-dimensional window +julia> layer = MaxPool((5,), pad = 2, stride = (3,)) # one-dimensional window MaxPool((5,), pad=2, stride=3) julia> layer(rand(Float32, 100, 7, 50)) |> size (34, 7, 50) ``` """ -struct MaxPool{N,M} - k::NTuple{N,Int} - pad::NTuple{M,Int} - stride::NTuple{N,Int} +struct MaxPool{N, M} + k::NTuple{N, Int} + pad::NTuple{M, Int} + stride::NTuple{N, Int} end -function MaxPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N - stride = expand(Val(N), stride) - pad = calc_padding(MaxPool, pad, k, 1, stride) - return MaxPool(k, pad, stride) +function MaxPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} + stride = expand(Val(N), stride) + pad = calc_padding(MaxPool, pad, k, 1, stride) + return MaxPool(k, pad, stride) end function (m::MaxPool)(x) - pdims = PoolDims(x, m.k; padding=m.pad, stride=m.stride) - return maxpool(x, pdims) + pdims = PoolDims(x, m.k; padding = m.pad, stride = m.stride) + return maxpool(x, pdims) end function Base.show(io::IO, m::MaxPool) - print(io, "MaxPool(", m.k) - all(==(0), m.pad) || print(io, ", pad=", _maybetuple_string(m.pad)) - m.stride == m.k || print(io, ", stride=", _maybetuple_string(m.stride)) - print(io, ")") + print(io, "MaxPool(", m.k) + all(==(0), m.pad) || print(io, ", pad=", _maybetuple_string(m.pad)) + m.stride == m.k || print(io, ", stride=", _maybetuple_string(m.stride)) + return print(io, ")") end _maybetuple_string(pad) = string(pad) -_maybetuple_string(pad::Tuple) = all(==(pad[1]), pad) ? string(pad[1]) : string(pad) +_maybetuple_string(pad::Tuple) = all(==(pad[1]), pad) ? string(pad[1]) : string(pad) """ MeanPool(window::NTuple; pad=0, stride=window) @@ -718,7 +735,7 @@ See also [`Conv`](@ref), [`MaxPool`](@ref), [`AdaptiveMeanPool`](@ref). ```jldoctest julia> xs = rand(Float32, 100, 100, 3, 50); -julia> m = Chain(Conv((5,5), 3 => 7), MeanPool((5,5), pad=SamePad())) +julia> m = Chain(Conv((5, 5), 3 => 7), MeanPool((5, 5), pad = SamePad())) Chain( Conv((5, 5), 3 => 7), # 532 parameters MeanPool((5, 5), pad=2), @@ -731,26 +748,26 @@ julia> m(xs) |> size (20, 20, 7, 50) ``` """ -struct MeanPool{N,M} - k::NTuple{N,Int} - pad::NTuple{M,Int} - stride::NTuple{N,Int} +struct MeanPool{N, M} + k::NTuple{N, Int} + pad::NTuple{M, Int} + stride::NTuple{N, Int} end -function MeanPool(k::NTuple{N,Integer}; pad = 0, stride = k) where N - stride = expand(Val(N), stride) - pad = calc_padding(MeanPool, pad, k, 1, stride) - return MeanPool(k, pad, stride) +function MeanPool(k::NTuple{N, Integer}; pad = 0, stride = k) where {N} + stride = expand(Val(N), stride) + pad = calc_padding(MeanPool, pad, k, 1, stride) + return MeanPool(k, pad, stride) end function (m::MeanPool)(x) - pdims = PoolDims(x, m.k; padding=m.pad, stride=m.stride) - return meanpool(x, pdims) + pdims = PoolDims(x, m.k; padding = m.pad, stride = m.stride) + return meanpool(x, pdims) end function Base.show(io::IO, m::MeanPool) - print(io, "MeanPool(", m.k) - all(==(0), m.pad) || print(io, ", pad=", _maybetuple_string(m.pad)) - m.stride == m.k || print(io, ", stride=", _maybetuple_string(m.stride)) - print(io, ")") + print(io, "MeanPool(", m.k) + all(==(0), m.pad) || print(io, ", pad=", _maybetuple_string(m.pad)) + m.stride == m.k || print(io, ", stride=", _maybetuple_string(m.stride)) + return print(io, ")") end diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 0f2696a50a..e832ce184a 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -5,7 +5,7 @@ ChainRulesCore.rrule(::typeof(istraining)) = true, _ -> (NoTangent(),) _isactive(m) = isnothing(m.active) ? istraining() : m.active _dropout_shape(s, ::Colon) = size(s) -_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) +_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) in enumerate(size(s)))...) _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) @@ -29,22 +29,23 @@ automatically managed using the [`Dropout`](@ref) layer instead of the The [`Dropout`](@ref) layer is what you should use in most scenarios. """ -function dropout(rng, x, p; dims=:, active::Bool=true) - active || return x - y = dropout_mask(rng, x, p, dims=dims) - return x .* y +function dropout(rng, x, p; dims = :, active::Bool = true) + active || return x + y = dropout_mask(rng, x, p, dims = dims) + return x .* y end dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) -dropout_mask(rng, x::CuArray, p; kwargs...) = - throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) +function dropout_mask(rng, x::CuArray, p; kwargs...) + throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) +end dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) -function _dropout_mask(rng, x, p; dims=:) - realfptype = float(real(eltype(x))) - y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims))) - y .= _dropout_kernel.(y, p, 1 - p) - return y +function _dropout_mask(rng, x, p; dims = :) + realfptype = float(real(eltype(x))) + y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims))) + y .= _dropout_kernel.(y, p, 1 - p) + return y end # TODO move this to NNlib @@ -56,9 +57,9 @@ ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any) Dropout layer. While training, for each input, this layer either sets that input to `0` (with probability -`p`) or scales it by `1 / (1 - p)`. To apply dropout along certain dimension(s), specify the +`p`) or scales it by `1 / (1 - p)`. To apply dropout along certain dimension(s), specify the `dims` keyword. e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input -(also called 2D dropout). This is used as a regularisation, i.e. it reduces overfitting during +(also called 2D dropout). This is used as a regularisation, i.e. it reduces overfitting during training. In the forward pass, this layer applies the [`Flux.dropout`](@ref) function. See that for more @@ -70,6 +71,7 @@ Custom RNGs are only supported on the CPU. Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. # Examples + ```jldoctest julia> m = Chain(Dense(1 => 1), Dropout(1)); @@ -86,38 +88,39 @@ julia> Flux.trainmode!(m); julia> y = m(ones(1000)); -julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1) +julia> isapprox(count(==(0), y) / length(y), 0.5, atol = 0.1) true ``` """ -mutable struct Dropout{F,D,R<:AbstractRNG} - p::F - dims::D - active::Union{Bool, Nothing} - rng::R +mutable struct Dropout{F, D, R <: AbstractRNG} + p::F + dims::D + active::Union{Bool, Nothing} + rng::R end Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p; dims=:, rng = default_rng_value()) - @assert 0 ≤ p ≤ 1 - Dropout(p, dims, nothing, rng) +function Dropout(p; dims = :, rng = default_rng_value()) + @assert 0 ≤ p ≤ 1 + return Dropout(p, dims, nothing, rng) end @functor Dropout trainable(a::Dropout) = (;) function (a::Dropout)(x) - _isactive(a) || return x - return dropout(a.rng, x, a.p; dims=a.dims, active=true) + _isactive(a) || return x + return dropout(a.rng, x, a.p; dims = a.dims, active = true) end -testmode!(m::Dropout, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::Dropout, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end function Base.show(io::IO, d::Dropout) - print(io, "Dropout(", d.p) - d.dims != (:) && print(io, ", dims = $(repr(d.dims))") - print(io, ")") + print(io, "Dropout(", d.p) + d.dims != (:) && print(io, ", dims = $(repr(d.dims))") + return print(io, ")") end """ @@ -131,10 +134,11 @@ remain the same as before. Does nothing to the input once [`testmode!`](@ref) is true. # Examples + ```jldoctest julia> using Statistics -julia> x = randn(1000,1); +julia> x = randn(1000, 1); julia> m = Chain(Dense(1000 => 1000, selu), AlphaDropout(0.2)); @@ -142,18 +146,18 @@ julia> Flux.trainmode!(m); julia> y = m(x); -julia> isapprox(std(x), std(y), atol=0.2) +julia> isapprox(std(x), std(y), atol = 0.2) true ``` """ -mutable struct AlphaDropout{F,R<:AbstractRNG} - p::F - active::Union{Bool, Nothing} - rng::R - function AlphaDropout(p, active, rng) - @assert 0 ≤ p ≤ 1 - new{typeof(p), typeof(rng)}(p, active, rng) - end +mutable struct AlphaDropout{F, R <: AbstractRNG} + p::F + active::Union{Bool, Nothing} + rng::R + function AlphaDropout(p, active, rng) + @assert 0 ≤ p ≤ 1 + return new{typeof(p), typeof(rng)}(p, active, rng) + end end AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) @@ -161,22 +165,23 @@ AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) @functor AlphaDropout trainable(a::AlphaDropout) = (;) -function (a::AlphaDropout)(x::AbstractArray{T}) where T - _isactive(a) || return x - p = a.p - iszero(p) && return x - isone(p) && return sign.(x) .* T(0) +function (a::AlphaDropout)(x::AbstractArray{T}) where {T} + _isactive(a) || return x + p = a.p + iszero(p) && return x + isone(p) && return sign.(x) .* T(0) - α′ = T(-1.7580993408473766) # selu(-Inf) == -λα - A = T(inv(sqrt((1 - p) * (1 + p * α′^2)))) - B = T(-A * α′ * p) + α′ = T(-1.7580993408473766) # selu(-Inf) == -λα + A = T(inv(sqrt((1 - p) * (1 + p * α′^2)))) + B = T(-A * α′ * p) - noise = rand!(a.rng, similar(x)) - return A .* ifelse.(noise .> p, x, α′) .+ B + noise = rand!(a.rng, similar(x)) + return A .* ifelse.(noise .> p, x, α′) .+ B end -testmode!(m::AlphaDropout, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::AlphaDropout, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end """ LayerNorm(size..., λ=identity; affine=true, ϵ=1fe-5) @@ -196,6 +201,7 @@ using the [`Scale`](@ref) layer. See also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref). # Examples + ```jldoctest julia> using Statistics @@ -205,79 +211,78 @@ julia> m = LayerNorm(3); julia> y = m(xs); -julia> isapprox(std(y, dims=1:3), ones(1, 1, 1, 2), atol=0.1) && std(y, dims=1:3) != std(xs, dims=1:3) +julia> isapprox(std(y, dims = 1:3), ones(1, 1, 1, 2), atol = 0.1) && + std(y, dims = 1:3) != std(xs, dims = 1:3) true ``` """ -struct LayerNorm{F,D,T,N} - λ::F - diag::D - ϵ::T - size::NTuple{N,Int} - affine::Bool +struct LayerNorm{F, D, T, N} + λ::F + diag::D + ϵ::T + size::NTuple{N, Int} + affine::Bool end -function LayerNorm(size::Tuple{Vararg{Int}}, λ=identity; affine::Bool=true, ϵ::Real=1f-5) - diag = affine ? Scale(size..., λ) : λ!=identity ? Base.Fix1(broadcast, λ) : identity - return LayerNorm(λ, diag, ϵ, size, affine) +function LayerNorm(size::Tuple{Vararg{Int}}, λ = identity; affine::Bool = true, + ϵ::Real = 1.0f-5) + diag = affine ? Scale(size..., λ) : λ != identity ? Base.Fix1(broadcast, λ) : identity + return LayerNorm(λ, diag, ϵ, size, affine) end LayerNorm(size::Integer...; kw...) = LayerNorm(Int.(size); kw...) -LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:end-1]), size_act[end]; kw...) +LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:(end - 1)]), size_act[end]; kw...) @functor LayerNorm -(a::LayerNorm)(x) = a.diag(normalise(x, dims=1:length(a.size), ϵ=a.ϵ)) +(a::LayerNorm)(x) = a.diag(normalise(x, dims = 1:length(a.size), ϵ = a.ϵ)) function Base.show(io::IO, l::LayerNorm) - print(io, "LayerNorm(", join(l.size, ", ")) - l.λ === identity || print(io, ", ", l.λ) - hasaffine(l) || print(io, ", affine=false") - print(io, ")") + print(io, "LayerNorm(", join(l.size, ", ")) + l.λ === identity || print(io, ", ", l.λ) + hasaffine(l) || print(io, ", affine=false") + return print(io, ")") end # For InstanceNorm, GroupNorm, and BatchNorm. # Compute the statistics on the slices specified by reduce_dims. # reduce_dims=[1,...,N-2,N] for BatchNorm # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm -function _norm_layer_forward( - l, x::AbstractArray{T, N}; reduce_dims, affine_shape, -) where {T, N} - if !_isactive(l) && l.track_stats # testmode with tracked stats - stats_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) - μ = reshape(l.μ, stats_shape) - σ² = reshape(l.σ², stats_shape) - else # trainmode or testmode without tracked stats - μ = mean(x; dims=reduce_dims) - σ² = var(x; mean=μ, dims=reduce_dims, corrected=false) - if l.track_stats - _track_stats!(l, x, μ, σ², reduce_dims) # update moving mean/std +function _norm_layer_forward(l, x::AbstractArray{T, N}; reduce_dims, + affine_shape) where {T, N} + if !_isactive(l) && l.track_stats # testmode with tracked stats + stats_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) + μ = reshape(l.μ, stats_shape) + σ² = reshape(l.σ², stats_shape) + else # trainmode or testmode without tracked stats + μ = mean(x; dims = reduce_dims) + σ² = var(x; mean = μ, dims = reduce_dims, corrected = false) + if l.track_stats + _track_stats!(l, x, μ, σ², reduce_dims) # update moving mean/std + end end - end - o = _norm_layer_forward(x, μ, σ², l.ϵ) - hasaffine(l) || return l.λ.(o) + o = _norm_layer_forward(x, μ, σ², l.ϵ) + hasaffine(l) || return l.λ.(o) - γ = reshape(l.γ, affine_shape) - β = reshape(l.β, affine_shape) - return l.λ.(γ .* o .+ β) + γ = reshape(l.γ, affine_shape) + β = reshape(l.β, affine_shape) + return l.λ.(γ .* o .+ β) end @inline _norm_layer_forward(x, μ, σ², ϵ) = (x .- μ) ./ sqrt.(σ² .+ ϵ) -function _track_stats!( - bn, x::AbstractArray{T, N}, μ, σ², reduce_dims, -) where {T, N} - V = eltype(bn.σ²) - mtm = bn.momentum - res_mtm = one(V) - mtm - m = prod(size(x, i) for i in reduce_dims) +function _track_stats!(bn, x::AbstractArray{T, N}, μ, σ², reduce_dims) where {T, N} + V = eltype(bn.σ²) + mtm = bn.momentum + res_mtm = one(V) - mtm + m = prod(size(x, i) for i in reduce_dims) - μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims=N)) - σ²new = vec(N ∈ reduce_dims ? σ² : mean(σ², dims=N)) + μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims = N)) + σ²new = vec(N ∈ reduce_dims ? σ² : mean(σ², dims = N)) - bn.μ = res_mtm .* bn.μ .+ mtm .* μnew - bn.σ² = res_mtm .* bn.σ² .+ mtm .* (m / (m - one(V))) .* σ²new - return nothing + bn.μ = res_mtm .* bn.μ .+ mtm .* μnew + bn.σ² = res_mtm .* bn.σ² .+ mtm .* (m / (m - one(V))) .* σ²new + return nothing end ChainRulesCore.@non_differentiable _track_stats!(::Any...) @@ -309,6 +314,7 @@ that will be used to renormalize the input in test phase. Use [`testmode!`](@ref) during inference. # Examples + ```julia julia> using Statistics @@ -318,62 +324,61 @@ julia> m = BatchNorm(3); julia> Flux.trainmode!(m); -julia> isapprox(std(m(xs)), 1, atol=0.1) && std(xs) != std(m(xs)) +julia> isapprox(std(m(xs)), 1, atol = 0.1) && std(xs) != std(m(xs)) true ``` """ -mutable struct BatchNorm{F,V,N,W} - λ::F # activation function - β::V # bias - γ::V # scale - μ::W # moving mean - σ²::W # moving var - ϵ::N - momentum::N - affine::Bool - track_stats::Bool - active::Union{Bool, Nothing} - chs::Int # number of channels +mutable struct BatchNorm{F, V, N, W} + λ::F # activation function + β::V # bias + γ::V # scale + μ::W # moving mean + σ²::W # moving var + ϵ::N + momentum::N + affine::Bool + track_stats::Bool + active::Union{Bool, Nothing} + chs::Int # number of channels end -function BatchNorm(chs::Int, λ=identity; - initβ=zeros32, initγ=ones32, - affine=true, track_stats=true, - ϵ=1f-5, momentum=0.1f0) - - β = affine ? initβ(chs) : nothing - γ = affine ? initγ(chs) : nothing - μ = track_stats ? zeros32(chs) : nothing - σ² = track_stats ? ones32(chs) : nothing - - return BatchNorm(λ, β, γ, - μ, σ², ϵ, momentum, - affine, track_stats, - nothing, chs) +function BatchNorm(chs::Int, λ = identity; + initβ = zeros32, initγ = ones32, + affine = true, track_stats = true, + ϵ = 1.0f-5, momentum = 0.1f0) + β = affine ? initβ(chs) : nothing + γ = affine ? initγ(chs) : nothing + μ = track_stats ? zeros32(chs) : nothing + σ² = track_stats ? ones32(chs) : nothing + + return BatchNorm(λ, β, γ, + μ, σ², ϵ, momentum, + affine, track_stats, + nothing, chs) end @functor BatchNorm trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;) function (BN::BatchNorm)(x) - @assert size(x, ndims(x)-1) == BN.chs - N = ndims(x) - reduce_dims = [1:N-2; N] - affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) - return _norm_layer_forward(BN, x; reduce_dims, affine_shape) + @assert size(x, ndims(x) - 1) == BN.chs + N = ndims(x) + reduce_dims = [1:(N - 2); N] + affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) + return _norm_layer_forward(BN, x; reduce_dims, affine_shape) end -testmode!(m::BatchNorm, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::BatchNorm, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end function Base.show(io::IO, l::BatchNorm) - print(io, "BatchNorm($(l.chs)") - (l.λ == identity) || print(io, ", $(l.λ)") - hasaffine(l) || print(io, ", affine=false") - print(io, ")") + print(io, "BatchNorm($(l.chs)") + (l.λ == identity) || print(io, ", $(l.λ)") + hasaffine(l) || print(io, ", affine=false") + return print(io, ")") end - """ InstanceNorm(channels::Integer, λ=identity; initβ=zeros32, initγ=ones32, @@ -399,6 +404,7 @@ that will be used to renormalize the input in test phase. in previous Flux versions (< v0.12). # Examples + ```jldoctest julia> using Statistics @@ -408,64 +414,66 @@ julia> m = InstanceNorm(3); julia> y = m(xs); -julia> isapprox(std(y, dims=1:2), ones(1, 1, 3, 2), atol=0.2) && std(y, dims=1:2) != std(xs, dims=1:2) +julia> isapprox(std(y, dims = 1:2), ones(1, 1, 3, 2), atol = 0.2) && + std(y, dims = 1:2) != std(xs, dims = 1:2) true ``` """ -mutable struct InstanceNorm{F,V,N,W} - λ::F # activation function - β::V # bias - γ::V # scale - μ::W # moving mean - σ²::W # moving var - ϵ::N - momentum::N - affine::Bool - track_stats::Bool - active::Union{Bool, Nothing} - chs::Int # number of channels +mutable struct InstanceNorm{F, V, N, W} + λ::F # activation function + β::V # bias + γ::V # scale + μ::W # moving mean + σ²::W # moving var + ϵ::N + momentum::N + affine::Bool + track_stats::Bool + active::Union{Bool, Nothing} + chs::Int # number of channels end -function InstanceNorm(chs::Int, λ=identity; - initβ=zeros32, initγ=ones32, - affine=false, track_stats=false, - ϵ=1f-5, momentum=0.1f0) - - if track_stats - Base.depwarn("`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", :InstanceNorm) - end +function InstanceNorm(chs::Int, λ = identity; + initβ = zeros32, initγ = ones32, + affine = false, track_stats = false, + ϵ = 1.0f-5, momentum = 0.1f0) + if track_stats + Base.depwarn("`track_stats=true` will be removed from InstanceNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :InstanceNorm) + end - β = affine ? initβ(chs) : nothing - γ = affine ? initγ(chs) : nothing - μ = track_stats ? zeros32(chs) : nothing - σ² = track_stats ? ones32(chs) : nothing + β = affine ? initβ(chs) : nothing + γ = affine ? initγ(chs) : nothing + μ = track_stats ? zeros32(chs) : nothing + σ² = track_stats ? ones32(chs) : nothing - return InstanceNorm(λ, β, γ, - μ, σ², ϵ, momentum, - affine, track_stats, - nothing, chs) + return InstanceNorm(λ, β, γ, + μ, σ², ϵ, momentum, + affine, track_stats, + nothing, chs) end @functor InstanceNorm trainable(in::InstanceNorm) = hasaffine(in) ? (β = in.β, γ = in.γ) : (;) function (l::InstanceNorm)(x) - @assert ndims(x) > 2 - @assert size(x, ndims(x)-1) == l.chs - N = ndims(x) - reduce_dims = 1:N-2 - affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) - return _norm_layer_forward(l, x; reduce_dims, affine_shape) + @assert ndims(x) > 2 + @assert size(x, ndims(x) - 1) == l.chs + N = ndims(x) + reduce_dims = 1:(N - 2) + affine_shape = ntuple(i -> i == N - 1 ? size(x, N - 1) : 1, N) + return _norm_layer_forward(l, x; reduce_dims, affine_shape) end -testmode!(m::InstanceNorm, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::InstanceNorm, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end function Base.show(io::IO, l::InstanceNorm) - print(io, "InstanceNorm($(l.chs)") - l.λ == identity || print(io, ", $(l.λ)") - hasaffine(l) || print(io, ", affine=false") - print(io, ")") + print(io, "InstanceNorm($(l.chs)") + l.λ == identity || print(io, ", $(l.λ)") + hasaffine(l) || print(io, ", affine=false") + return print(io, ")") end """ @@ -494,6 +502,7 @@ If `track_stats=true`, accumulates mean and var statistics in training phase that will be used to renormalize the input in test phase. # Examples + ```jldoctest julia> using Statistics @@ -503,77 +512,82 @@ julia> m = GroupNorm(4, 2); julia> y = m(xs); -julia> isapprox(std(y[:, :, 1:2, 1]), 1, atol=0.1) && std(xs[:, :, 1:2, 1]) != std(y[:, :, 1:2, 1]) +julia> isapprox(std(y[:, :, 1:2, 1]), 1, atol = 0.1) && + std(xs[:, :, 1:2, 1]) != std(y[:, :, 1:2, 1]) true -julia> isapprox(std(y[:, :, 3:4, 2]), 1, atol=0.1) && std(xs[:, :, 3:4, 2]) != std(y[:, :, 3:4, 2]) +julia> isapprox(std(y[:, :, 3:4, 2]), 1, atol = 0.1) && + std(xs[:, :, 3:4, 2]) != std(y[:, :, 3:4, 2]) true +``` # number of groups ``` """ -mutable struct GroupNorm{F,V,N,W} - G::Int # number of groups - λ::F # activation function - β::V # bias - γ::V # scale - μ::W # moving mean - σ²::W # moving std - ϵ::N - momentum::N - affine::Bool - track_stats::Bool - active::Union{Bool, Nothing} - chs::Int # number of channels +mutable struct GroupNorm{F, V, N, W} + G::Int # number of groups + λ::F # activation function + β::V # bias + γ::V # scale + μ::W # moving mean + σ²::W # moving std + ϵ::N + momentum::N + affine::Bool + track_stats::Bool + active::Union{Bool, Nothing} + chs::Int # number of channels end @functor GroupNorm trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;) -function GroupNorm(chs::Int, G::Int, λ=identity; - initβ=zeros32, initγ=ones32, - affine=true, track_stats=false, - ϵ=1f-5, momentum=0.1f0) - -if track_stats - Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", :GroupNorm) -end +function GroupNorm(chs::Int, G::Int, λ = identity; + initβ = zeros32, initγ = ones32, + affine = true, track_stats = false, + ϵ = 1.0f-5, momentum = 0.1f0) + if track_stats + Base.depwarn("`track_stats=true` will be removed from GroupNorm in Flux 0.14. The default value is `track_stats=false`, which will work as before.", + :GroupNorm) + end - chs % G == 0 || error("The number of groups ($(G)) must divide the number of channels ($chs)") + chs % G == 0 || + error("The number of groups ($(G)) must divide the number of channels ($chs)") - β = affine ? initβ(chs) : nothing - γ = affine ? initγ(chs) : nothing - μ = track_stats ? zeros32(G) : nothing - σ² = track_stats ? ones32(G) : nothing + β = affine ? initβ(chs) : nothing + γ = affine ? initγ(chs) : nothing + μ = track_stats ? zeros32(G) : nothing + σ² = track_stats ? ones32(G) : nothing - return GroupNorm(G, λ, - β, γ, - μ, σ², - ϵ, momentum, - affine, track_stats, - nothing, chs) + return GroupNorm(G, λ, + β, γ, + μ, σ², + ϵ, momentum, + affine, track_stats, + nothing, chs) end function (gn::GroupNorm)(x) - @assert ndims(x) > 2 - @assert size(x, ndims(x)-1) == gn.chs - N = ndims(x) - sz = size(x) - x = reshape(x, sz[1:N-2]..., sz[N-1]÷gn.G, gn.G, sz[N]) - N = ndims(x) - reduce_dims = 1:N-2 - affine_shape = ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N) - x = _norm_layer_forward(gn, x; reduce_dims, affine_shape) - return reshape(x, sz) + @assert ndims(x) > 2 + @assert size(x, ndims(x) - 1) == gn.chs + N = ndims(x) + sz = size(x) + x = reshape(x, sz[1:(N - 2)]..., sz[N - 1] ÷ gn.G, gn.G, sz[N]) + N = ndims(x) + reduce_dims = 1:(N - 2) + affine_shape = ntuple(i -> i ∈ (N - 1, N - 2) ? size(x, i) : 1, N) + x = _norm_layer_forward(gn, x; reduce_dims, affine_shape) + return reshape(x, sz) end -testmode!(m::GroupNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +function testmode!(m::GroupNorm, mode = true) + return (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) +end function Base.show(io::IO, l::GroupNorm) - # print(io, "GroupNorm($(join(size(l.β), ", "))", ", ", l.G) - print(io, "GroupNorm($(l.chs), $(l.G)") - l.λ == identity || print(io, ", ", l.λ) - hasaffine(l) || print(io, ", affine=false") - print(io, ")") + # print(io, "GroupNorm($(join(size(l.β), ", "))", ", ", l.G) + print(io, "GroupNorm($(l.chs), $(l.G)") + l.λ == identity || print(io, ", ", l.λ) + hasaffine(l) || print(io, ", affine=false") + return print(io, ")") end """ diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 7cabc9d5b6..ef19714f1f 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -1,52 +1,52 @@ -gate(h, n) = (1:h) .+ h*(n-1) -gate(x::AbstractVector, h, n) = @view x[gate(h,n)] -gate(x::AbstractMatrix, h, n) = view(x, gate(h,n), :) +gate(h, n) = (1:h) .+ h * (n - 1) +gate(x::AbstractVector, h, n) = @view x[gate(h, n)] +gate(x::AbstractMatrix, h, n) = view(x, gate(h, n), :) # AD-friendly helper for dividing monolithic RNN params into equally sized gates -multigate(x::AbstractArray, h, ::Val{N}) where N = ntuple(n -> gate(x,h,n), N) +multigate(x::AbstractArray, h, ::Val{N}) where {N} = ntuple(n -> gate(x, h, n), N) function ChainRulesCore.rrule(::typeof(multigate), x::AbstractArray, h, c) - function multigate_pullback(dy) - dx = map!(zero, similar(x, float(eltype(x)), axes(x)), x) - foreach(multigate(dx, h, c), unthunk(dy)) do dxᵢ, dyᵢ - dyᵢ isa AbstractZero && return - @. dxᵢ += dyᵢ + function multigate_pullback(dy) + dx = map!(zero, similar(x, float(eltype(x)), axes(x)), x) + foreach(multigate(dx, h, c), unthunk(dy)) do dxᵢ, dyᵢ + dyᵢ isa AbstractZero && return + @. dxᵢ += dyᵢ + end + return (NoTangent(), dx, NoTangent(), NoTangent()) end - return (NoTangent(), dx, NoTangent(), NoTangent()) - end - return multigate(x, h, c), multigate_pullback + return multigate(x, h, c), multigate_pullback end # Type stable and AD-friendly helper for iterating over the last dimension of an array -function eachlastdim(A::AbstractArray{T,N}) where {T,N} - inds_before = ntuple(_ -> :, N-1) - return (view(A, inds_before..., i) for i in axes(A, N)) +function eachlastdim(A::AbstractArray{T, N}) where {T, N} + inds_before = ntuple(_ -> :, N - 1) + return (view(A, inds_before..., i) for i in axes(A, N)) end # adapted from https://github.com/JuliaDiff/ChainRules.jl/blob/f13e0a45d10bb13f48d6208e9c9d5b4a52b96732/src/rulesets/Base/indexing.jl#L77 function ∇eachlastdim(dys_raw, x::AbstractArray{T, N}) where {T, N} - dys = unthunk(dys_raw) - i1 = findfirst(dy -> dy isa AbstractArray, dys) - if isnothing(i1) # all slices are Zero! - return fill!(similar(x, T, axes(x)), zero(T)) - end - # The whole point of this gradient is that we can allocate one `dx` array: - dx = similar(x, T, axes(x))::AbstractArray - for i in axes(x, N) - slice = selectdim(dx, N, i) - if dys[i] isa AbstractZero - fill!(slice, zero(eltype(slice))) - else - copyto!(slice, dys[i]) - end - end - return ProjectTo(x)(dx) + dys = unthunk(dys_raw) + i1 = findfirst(dy -> dy isa AbstractArray, dys) + if isnothing(i1) # all slices are Zero! + return fill!(similar(x, T, axes(x)), zero(T)) + end + # The whole point of this gradient is that we can allocate one `dx` array: + dx = similar(x, T, axes(x))::AbstractArray + for i in axes(x, N) + slice = selectdim(dx, N, i) + if dys[i] isa AbstractZero + fill!(slice, zero(eltype(slice))) + else + copyto!(slice, dys[i]) + end + end + return ProjectTo(x)(dx) end -function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T,N}) where {T,N} - lastdims(dy) = (NoTangent(), ∇eachlastdim(unthunk(dy), x)) - collect(eachlastdim(x)), lastdims +function ChainRulesCore.rrule(::typeof(eachlastdim), x::AbstractArray{T, N}) where {T, N} + lastdims(dy) = (NoTangent(), ∇eachlastdim(unthunk(dy), x)) + return collect(eachlastdim(x)), lastdims end reshape_cell_output(h, x) = reshape(h, :, size(x)[2:end]...) @@ -64,6 +64,7 @@ in the background. `cell` should be a model of the form: For example, here's a recurrent network that keeps a running total of its inputs: # Examples + ```jldoctest julia> accum(h, x) = (h + x, x) accum (generic function with 1 method) @@ -71,7 +72,7 @@ accum (generic function with 1 method) julia> rnn = Flux.Recur(accum, 0) Recur(accum) -julia> rnn(2) +julia> rnn(2) 2 julia> rnn(3) @@ -125,14 +126,14 @@ julia> rnn.state 60 ``` """ -mutable struct Recur{T,S} - cell::T - state::S +mutable struct Recur{T, S} + cell::T + state::S end function (m::Recur)(x) - m.state, y = m.cell(m.state, x) - return y + m.state, y = m.cell(m.state, x) + return y end @functor Recur @@ -150,16 +151,17 @@ Assuming you have a `Recur` layer `rnn`, this is roughly equivalent to: rnn.state = hidden(rnn.cell) # Examples + ```jldoctest -julia> r = Flux.RNNCell(relu, ones(1,1), zeros(1,1), ones(1,1), zeros(1,1)); # users should use the RNN wrapper struct instead +julia> r = Flux.RNNCell(relu, ones(1, 1), zeros(1, 1), ones(1, 1), zeros(1, 1)); # users should use the RNN wrapper struct instead -julia> y = Flux.Recur(r, ones(1,1)); +julia> y = Flux.Recur(r, ones(1, 1)); julia> y.state 1×1 Matrix{Float64}: 1.0 -julia> y(ones(1,1)) # relu(1*1 + 1) +julia> y(ones(1, 1)) # relu(1*1 + 1) 1×1 Matrix{Float64}: 2.0 @@ -181,38 +183,44 @@ reset!(m) = foreach(reset!, functor(m)[1]) flip(f, xs) = reverse([f(x) for x in reverse(xs)]) -function (m::Recur)(x::AbstractArray{T, 3}) where T - h = [m(x_t) for x_t in eachlastdim(x)] - sze = size(h[1]) - reshape(reduce(hcat, h), sze[1], sze[2], length(h)) +function (m::Recur)(x::AbstractArray{T, 3}) where {T} + h = [m(x_t) for x_t in eachlastdim(x)] + sze = size(h[1]) + return reshape(reduce(hcat, h), sze[1], sze[2], length(h)) end # Vanilla RNN -struct RNNCell{F,I,H,V,S} - σ::F - Wi::I - Wh::H - b::V - state0::S +struct RNNCell{F, I, H, V, S} + σ::F + Wi::I + Wh::H + b::V + state0::S end -RNNCell((in, out)::Pair, σ=tanh; init=Flux.glorot_uniform, initb=zeros32, init_state=zeros32) = - RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out,1)) +function RNNCell((in, out)::Pair, σ = tanh; init = Flux.glorot_uniform, initb = zeros32, + init_state = zeros32) + return RNNCell(σ, init(out, in), init(out, out), initb(out), init_state(out, 1)) +end -function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {F,I,H,V,T} - Wi, Wh, b = m.Wi, m.Wh, m.b - σ = NNlib.fast_act(m.σ, x) - h = σ.(Wi*x .+ Wh*h .+ b) - return h, reshape_cell_output(h, x) +function (m::RNNCell{F, I, H, V, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {F, I, + H, V, T + } + Wi, Wh, b = m.Wi, m.Wh, m.b + σ = NNlib.fast_act(m.σ, x) + h = σ.(Wi * x .+ Wh * h .+ b) + return h, reshape_cell_output(h, x) end @functor RNNCell function Base.show(io::IO, l::RNNCell) - print(io, "RNNCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)) - l.σ == identity || print(io, ", ", l.σ) - print(io, ")") + print(io, "RNNCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)) + l.σ == identity || print(io, ", ", l.σ) + return print(io, ")") end """ @@ -226,6 +234,7 @@ The arguments `in` and `out` describe the size of the feature vectors passed as This constructor is syntactic sugar for `Recur(RNNCell(a...))`, and so RNNs are stateful. Note that the state shape can change depending on the inputs, and so it is good to `reset!` the model between inference calls if the batch size changes. See the examples below. # Examples + ```jldoctest julia> r = RNN(3 => 5) Recur( @@ -243,81 +252,93 @@ julia> r(rand(Float32, 3, 10)) |> size # batch size of 10 ``` !!! warning "Batch size changes" - + Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the following example: - + ```julia julia> r = RNN(3 => 5) Recur( RNNCell(3 => 5, tanh), # 50 parameters ) # Total: 4 trainable arrays, 50 parameters, # plus 1 non-trainable, 5 parameters, summarysize 432 bytes. - + julia> r.state |> size (5, 1) - + julia> r(rand(Float32, 3)) |> size (5,) - + julia> r.state |> size (5, 1) - + julia> r(rand(Float32, 3, 10)) |> size # batch size of 10 (5, 10) - + julia> r.state |> size # state shape has changed (5, 10) - + julia> r(rand(Float32, 3)) |> size # erroneously outputs a length 5*10 = 50 vector. (50,) ``` # Note: - `RNNCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type, but if `Wh` is `dxd`, then `Wi` should be of shape `dxN`. - ```julia - julia> using LinearAlgebra +`RNNCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type, but if `Wh` is `dxd`, then `Wi` should be of shape `dxN`. + +```julia +julia> using LinearAlgebra - julia> r = Flux.Recur(Flux.RNNCell(tanh, rand(5, 4), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) +julia> r = Flux.Recur(Flux.RNNCell(tanh, rand(5, 4), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) - julia> r(rand(4, 10)) |> size # batch size of 10 - (5, 10) - ``` +julia> r(rand(4, 10)) |> size # batch size of 10 +(5, 10) +``` """ RNN(a...; ka...) = Recur(RNNCell(a...; ka...)) Recur(m::RNNCell) = Recur(m, m.state0) # LSTM -struct LSTMCell{I,H,V,S} - Wi::I - Wh::H - b::V - state0::S +struct LSTMCell{I, H, V, S} + Wi::I + Wh::H + b::V + state0::S end function LSTMCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state = zeros32) - cell = LSTMCell(init(out * 4, in), init(out * 4, out), initb(out * 4), (init_state(out,1), init_state(out,1))) - cell.b[gate(out, 2)] .= 1 - return cell + cell = LSTMCell(init(out * 4, in), init(out * 4, out), initb(out * 4), + (init_state(out, 1), init_state(out, 1))) + cell.b[gate(out, 2)] .= 1 + return cell end -function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})((h, c), x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T} - b, o = m.b, size(h, 1) - g = muladd(m.Wi, x, muladd(m.Wh, h, b)) - input, forget, cell, output = multigate(g, o, Val(4)) - c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell) - h′ = @. sigmoid_fast(output) * tanh_fast(c′) - return (h′, c′), reshape_cell_output(h′, x) +function (m::LSTMCell{I, H, V, <:NTuple{2, AbstractMatrix{T}}})((h, c), + x::Union{ + AbstractVecOrMat{T + }, + OneHotArray}) where { + I, + H, + V, + T + } + b, o = m.b, size(h, 1) + g = muladd(m.Wi, x, muladd(m.Wh, h, b)) + input, forget, cell, output = multigate(g, o, Val(4)) + c′ = @. sigmoid_fast(forget) * c + sigmoid_fast(input) * tanh_fast(cell) + h′ = @. sigmoid_fast(output) * tanh_fast(c′) + return (h′, c′), reshape_cell_output(h′, x) end @functor LSTMCell -Base.show(io::IO, l::LSTMCell) = - print(io, "LSTMCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷4, ")") +function Base.show(io::IO, l::LSTMCell) + return print(io, "LSTMCell(", size(l.Wi, 2), " => ", size(l.Wi, 1) ÷ 4, ")") +end """ LSTM(in => out) @@ -333,6 +354,7 @@ See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. # Examples + ```jldoctest julia> l = LSTM(3 => 5) Recur( @@ -350,10 +372,12 @@ julia> l(rand(Float32, 3, 10)) |> size # batch size of 10 ``` !!! warning "Batch size changes" + Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `LSTMCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`LSTMCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ LSTM(a...; ka...) = Recur(LSTMCell(a...; ka...)) Recur(m::LSTMCell) = Recur(m, m.state0) @@ -361,34 +385,42 @@ Recur(m::LSTMCell) = Recur(m, m.state0) # GRU function _gru_output(gxs, ghs, bs) - r = @. sigmoid_fast(gxs[1] + ghs[1] + bs[1]) - z = @. sigmoid_fast(gxs[2] + ghs[2] + bs[2]) - return r, z + r = @. sigmoid_fast(gxs[1] + ghs[1] + bs[1]) + z = @. sigmoid_fast(gxs[2] + ghs[2] + bs[2]) + return r, z end -struct GRUCell{I,H,V,S} - Wi::I - Wh::H - b::V - state0::S +struct GRUCell{I, H, V, S} + Wi::I + Wh::H + b::V + state0::S end -GRUCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state = zeros32) = - GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), init_state(out,1)) +function GRUCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, + init_state = zeros32) + return GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), + init_state(out, 1)) +end -function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,T} - Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) - gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(3)), multigate(b, o, Val(3)) - r, z = _gru_output(gxs, ghs, bs) - h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3]) - h′ = @. (1 - z) * h̃ + z * h - return h′, reshape_cell_output(h′, x) +function (m::GRUCell{I, H, V, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {I, H, V, T + } + Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1) + gxs, ghs, bs = multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(3)), + multigate(b, o, Val(3)) + r, z = _gru_output(gxs, ghs, bs) + h̃ = @. tanh_fast(gxs[3] + r * ghs[3] + bs[3]) + h′ = @. (1 - z) * h̃ + z * h + return h′, reshape_cell_output(h′, x) end @functor GRUCell -Base.show(io::IO, l::GRUCell) = - print(io, "GRUCell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷3, ")") +function Base.show(io::IO, l::GRUCell) + return print(io, "GRUCell(", size(l.Wi, 2), " => ", size(l.Wi, 1) ÷ 3, ")") +end """ GRU(in => out) @@ -405,6 +437,7 @@ See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. # Examples + ```jldoctest julia> g = GRU(3 => 5) Recur( @@ -422,41 +455,53 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10 ``` !!! warning "Batch size changes" + Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `GRUCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`GRUCell`s can be constructed directly by specifying the non-linear function, the `Wi` and `Wh` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi` and `Wh` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ GRU(a...; ka...) = Recur(GRUCell(a...; ka...)) Recur(m::GRUCell) = Recur(m, m.state0) # GRU v3 -struct GRUv3Cell{I,H,V,HH,S} - Wi::I - Wh::H - b::V - Wh_h̃::HH - state0::S +struct GRUv3Cell{I, H, V, HH, S} + Wi::I + Wh::H + b::V + Wh_h̃::HH + state0::S +end + +function GRUv3Cell((in, out)::Pair; init = glorot_uniform, initb = zeros32, + init_state = zeros32) + return GRUv3Cell(init(out * 3, in), init(out * 2, out), initb(out * 3), + init(out, out), init_state(out, 1)) end -GRUv3Cell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state = zeros32) = - GRUv3Cell(init(out * 3, in), init(out * 2, out), initb(out * 3), - init(out, out), init_state(out,1)) - -function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{T},OneHotArray}) where {I,H,V,HH,T} - Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) - gxs, ghs, bs = multigate(Wi*x, o, Val(3)), multigate(Wh*h, o, Val(2)), multigate(b, o, Val(3)) - r, z = _gru_output(gxs, ghs, bs) - h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) - h′ = @. (1 - z) * h̃ + z * h - return h′, reshape_cell_output(h′, x) +function (m::GRUv3Cell{I, H, V, HH, <:AbstractMatrix{T}})(h, + x::Union{AbstractVecOrMat{T}, + OneHotArray}) where {I, + H, + V, + HH, + T} + Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1) + gxs, ghs, bs = multigate(Wi * x, o, Val(3)), multigate(Wh * h, o, Val(2)), + multigate(b, o, Val(3)) + r, z = _gru_output(gxs, ghs, bs) + h̃ = tanh_fast.(gxs[3] .+ (Wh_h̃ * (r .* h)) .+ bs[3]) + h′ = @. (1 - z) * h̃ + z * h + return h′, reshape_cell_output(h′, x) end @functor GRUv3Cell -Base.show(io::IO, l::GRUv3Cell) = - print(io, "GRUv3Cell(", size(l.Wi, 2), " => ", size(l.Wi, 1)÷3, ")") +function Base.show(io::IO, l::GRUv3Cell) + return print(io, "GRUv3Cell(", size(l.Wi, 2), " => ", size(l.Wi, 1) ÷ 3, ")") +end """ GRUv3(in => out) @@ -473,6 +518,7 @@ See [this article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) for a good overview of the internals. # Examples + ```jldoctest julia> g = GRUv3(3 => 5) Recur( @@ -490,10 +536,12 @@ julia> g(rand(Float32, 3, 10)) |> size # batch size of 10 ``` !!! warning "Batch size changes" + Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. See the example in [`RNN`](@ref). # Note: - `GRUv3Cell`s can be constructed directly by specifying the non-linear function, the `Wi`, `Wh`, and `Wh_h` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi`, `Wh`, and `Wh_h` matrices do not need to be the same type. See the example in [`RNN`](@ref). + +`GRUv3Cell`s can be constructed directly by specifying the non-linear function, the `Wi`, `Wh`, and `Wh_h` internal matrices, a bias vector `b`, and a learnable initial state `state0`. The `Wi`, `Wh`, and `Wh_h` matrices do not need to be the same type. See the example in [`RNN`](@ref). """ GRUv3(a...; ka...) = Recur(GRUv3Cell(a...; ka...)) Recur(m::GRUv3Cell) = Recur(m, m.state0) diff --git a/src/layers/show.jl b/src/layers/show.jl index 421131f365..b2f272b9ef 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -1,47 +1,48 @@ for T in [ - :Chain, :Parallel, :SkipConnection, :Recur, :Maxout, :PairwiseFusion # container types - ] - @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) - if get(io, :typeinfo, nothing) === nothing # e.g. top level in REPL - _big_show(io, x) - elseif !get(io, :compact, false) # e.g. printed inside a Vector, but not a Matrix - _layer_show(io, x) - else - show(io, x) + :Chain, :Parallel, :SkipConnection, :Recur, :Maxout, :PairwiseFusion, # container types +] + @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) + if get(io, :typeinfo, nothing) === nothing # e.g. top level in REPL + _big_show(io, x) + elseif !get(io, :compact, false) # e.g. printed inside a Vector, but not a Matrix + _layer_show(io, x) + else + show(io, x) + end end - end end -function _big_show(io::IO, obj, indent::Int=0, name=nothing) - pre, post = obj isa Chain{<:AbstractVector} ? ("([", "])") : ("(", ")") - children = _show_children(obj) - if all(_show_leaflike, children) - _layer_show(io, obj, indent, name) - else - println(io, " "^indent, isnothing(name) ? "" : "$name = ", nameof(typeof(obj)), pre) - if obj isa Chain{<:NamedTuple} && children == getfield(obj, :layers) - # then we insert names -- can this be done more generically? - for k in Base.keys(obj) - _big_show(io, obj[k], indent+2, k) - end - elseif obj isa Parallel{<:Any, <:NamedTuple} || obj isa PairwiseFusion{<:Any, <:NamedTuple} - _big_show(io, obj.connection, indent+2) - for k in Base.keys(obj) - _big_show(io, obj[k], indent+2, k) - end - else - for c in children - _big_show(io, c, indent+2) - end - end - if indent == 0 # i.e. this is the outermost container - print(io, rpad(post, 2)) - _big_finale(io, obj) +function _big_show(io::IO, obj, indent::Int = 0, name = nothing) + pre, post = obj isa Chain{<:AbstractVector} ? ("([", "])") : ("(", ")") + children = _show_children(obj) + if all(_show_leaflike, children) + _layer_show(io, obj, indent, name) else - println(io, " "^indent, post, ",") + println(io, " "^indent, isnothing(name) ? "" : "$name = ", nameof(typeof(obj)), pre) + if obj isa Chain{<:NamedTuple} && children == getfield(obj, :layers) + # then we insert names -- can this be done more generically? + for k in Base.keys(obj) + _big_show(io, obj[k], indent + 2, k) + end + elseif obj isa Parallel{<:Any, <:NamedTuple} || + obj isa PairwiseFusion{<:Any, <:NamedTuple} + _big_show(io, obj.connection, indent + 2) + for k in Base.keys(obj) + _big_show(io, obj[k], indent + 2, k) + end + else + for c in children + _big_show(io, c, indent + 2) + end + end + if indent == 0 # i.e. this is the outermost container + print(io, rpad(post, 2)) + _big_finale(io, obj) + else + println(io, " "^indent, post, ",") + end end - end end _show_leaflike(x) = isleaf(x) # mostly follow Functors, except for: @@ -59,67 +60,75 @@ _show_children(f::PairwiseFusion) = (f.connection, f.layers...) for T in [ :Conv, :ConvTranspose, :CrossCor, :Dense, :Scale, :Bilinear, :Embedding, :BatchNorm, :LayerNorm, :InstanceNorm, :GroupNorm, - ] - @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) - if !get(io, :compact, false) - _layer_show(io, x) - else - show(io, x) +] + @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) + if !get(io, :compact, false) + _layer_show(io, x) + else + show(io, x) + end end - end end -function _layer_show(io::IO, layer, indent::Int=0, name=nothing) - _str = isnothing(name) ? "" : "$name = " - str = _str * sprint(show, layer, context=io) - print(io, " "^indent, str, indent==0 ? "" : ",") - if !isempty(params(layer)) - print(io, " "^max(2, (indent==0 ? 20 : 39) - indent - length(str))) - printstyled(io, "# ", underscorise(sum(length, params(layer))), " parameters"; color=:light_black) - nonparam = _childarray_sum(length, layer) - sum(length, params(layer)) - if nonparam > 0 - printstyled(io, ", plus ", underscorise(nonparam), indent==0 ? " non-trainable" : ""; color=:light_black) +function _layer_show(io::IO, layer, indent::Int = 0, name = nothing) + _str = isnothing(name) ? "" : "$name = " + str = _str * sprint(show, layer, context = io) + print(io, " "^indent, str, indent == 0 ? "" : ",") + if !isempty(params(layer)) + print(io, " "^max(2, (indent == 0 ? 20 : 39) - indent - length(str))) + printstyled(io, "# ", underscorise(sum(length, params(layer))), " parameters"; + color = :light_black) + nonparam = _childarray_sum(length, layer) - sum(length, params(layer)) + if nonparam > 0 + printstyled(io, ", plus ", underscorise(nonparam), + indent == 0 ? " non-trainable" : ""; color = :light_black) + end + _nan_show(io, params(layer)) end - _nan_show(io, params(layer)) - end - indent==0 || println(io) + return indent == 0 || println(io) end function _big_finale(io::IO, m) - ps = params(m) - if length(ps) > 2 - pars = underscorise(sum(length, ps)) - bytes = Base.format_bytes(Base.summarysize(m)) - noncnt = _childarray_sum(_->1, m) - length(ps) - if noncnt > 0 - nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps)) - printstyled(io, " "^08, "# Total: ", length(ps), " trainable arrays, "; color=:light_black) - println(io, pars, " parameters,") - printstyled(io, " "^10, "# plus ", noncnt, " non-trainable, ", nonparam, " parameters, summarysize "; color=:light_black) - print(io, bytes, ".") - else - printstyled(io, " "^18, "# Total: ", length(ps), " arrays, "; color=:light_black) - print(io, pars, " parameters, ", bytes, ".") + ps = params(m) + if length(ps) > 2 + pars = underscorise(sum(length, ps)) + bytes = Base.format_bytes(Base.summarysize(m)) + noncnt = _childarray_sum(_ -> 1, m) - length(ps) + if noncnt > 0 + nonparam = underscorise(_childarray_sum(length, m) - sum(length, ps)) + printstyled(io, " "^08, "# Total: ", length(ps), " trainable arrays, "; + color = :light_black) + println(io, pars, " parameters,") + printstyled(io, " "^10, "# plus ", noncnt, " non-trainable, ", nonparam, + " parameters, summarysize "; color = :light_black) + print(io, bytes, ".") + else + printstyled(io, " "^18, "# Total: ", length(ps), " arrays, "; + color = :light_black) + print(io, pars, " parameters, ", bytes, ".") + end end - end end _childarray_sum(f, x::AbstractArray{<:Number}) = f(x) -_childarray_sum(f, x) = isleaf(x) ? 0 : sum(y -> _childarray_sum(f, y), Functors.children(x)) +function _childarray_sum(f, x) + return isleaf(x) ? 0 : sum(y -> _childarray_sum(f, y), Functors.children(x)) +end # utility functions -underscorise(n::Integer) = - join(reverse(join.(reverse.(Iterators.partition(digits(n), 3)))), '_') +function underscorise(n::Integer) + return join(reverse(join.(reverse.(Iterators.partition(digits(n), 3)))), '_') +end function _nan_show(io::IO, x) - if !isempty(x) && _all(iszero, x) - printstyled(io, " (all zero)", color=:cyan) - elseif _any(isnan, x) - printstyled(io, " (some NaN)", color=:red) - elseif _any(isinf, x) - printstyled(io, " (some Inf)", color=:red) - end + if !isempty(x) && _all(iszero, x) + printstyled(io, " (all zero)", color = :cyan) + elseif _any(isnan, x) + printstyled(io, " (some NaN)", color = :red) + elseif _any(isinf, x) + printstyled(io, " (some Inf)", color = :red) + end end _any(f, xs::AbstractArray{<:Number}) = any(f, xs) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 06c8b6a4a9..b772fbbcc2 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -7,13 +7,14 @@ preserving the size of the last dimension. See also [`unsqueeze`](@ref). # Examples + ```jldoctest -julia> rand(3,4,5) |> Flux.flatten |> size +julia> rand(3, 4, 5) |> Flux.flatten |> size (12, 5) -julia> xs = rand(Float32, 10,10,3,7); +julia> xs = rand(Float32, 10, 10, 3, 7); -julia> m = Chain(Conv((3,3), 3 => 4, pad=1), Flux.flatten, Dense(400 => 33)); +julia> m = Chain(Conv((3, 3), 3 => 4, pad = 1), Flux.flatten, Dense(400 => 33)); julia> xs |> m[1] |> size (10, 10, 4, 7) @@ -23,17 +24,18 @@ julia> xs |> m |> size ``` """ function flatten(x::AbstractArray) - return reshape(x, :, size(x)[end]) + return reshape(x, :, size(x)[end]) end """ normalise(x; dims=ndims(x), ϵ=1e-5) Normalise `x` to mean 0 and standard deviation 1 across the dimension(s) given by `dims`. -Per default, `dims` is the last dimension. +Per default, `dims` is the last dimension. `ϵ` is a small additive factor added to the denominator for numerical stability. # Examples + ```jldoctest julia> using Statistics @@ -41,19 +43,19 @@ julia> x = [9, 10, 20, 60]; julia> y = Flux.normalise(x); -julia> isapprox(std(y), 1, atol=0.2) && std(y) != std(x) +julia> isapprox(std(y), 1, atol = 0.2) && std(y) != std(x) true julia> x = rand(1:100, 10, 2); -julia> y = Flux.normalise(x, dims=1); +julia> y = Flux.normalise(x, dims = 1); -julia> isapprox(std(y, dims=1), ones(1, 2), atol=0.2) && std(y, dims=1) != std(x, dims=1) +julia> isapprox(std(y, dims = 1), ones(1, 2), atol = 0.2) && std(y, dims = 1) != std(x, dims = 1) true ``` """ -@inline function normalise(x::AbstractArray; dims=ndims(x), ϵ=ofeltype(x, 1e-5)) - μ = mean(x, dims=dims) - σ = std(x, dims=dims, mean=μ, corrected=false) - return @. (x - μ) / (σ + ϵ) +@inline function normalise(x::AbstractArray; dims = ndims(x), ϵ = ofeltype(x, 1e-5)) + μ = mean(x, dims = dims) + σ = std(x, dims = dims, mean = μ, corrected = false) + return @. (x - μ) / (σ + ϵ) end diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index c71a9acc8d..2d50074833 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -4,13 +4,14 @@ An upsampling layer. One of two keywords must be given: -If `scale` is a number, this applies to all but the last two dimensions (channel and batch) of the input. -It may also be a tuple, to control dimensions individually. Alternatively, keyword +If `scale` is a number, this applies to all but the last two dimensions (channel and batch) of the input. +It may also be a tuple, to control dimensions individually. Alternatively, keyword `size` accepts a tuple, to directly specify the leading dimensions of the output. -Currently supported upsampling `mode`s +Currently supported upsampling `mode`s and corresponding NNlib's methods are: - - `:nearest` -> [`NNlib.upsample_nearest`](@ref) + + - `:nearest` -> [`NNlib.upsample_nearest`](@ref) - `:bilinear` -> [`NNlib.upsample_bilinear`](@ref) - `:trilinear` -> [`NNlib.upsample_trilinear`](@ref) @@ -31,45 +32,45 @@ julia> m(ones(2, 2, 1, 1)) |> size ``` """ struct Upsample{mode, S, T} - scale::S - size::T + scale::S + size::T end function Upsample(mode::Symbol = :nearest; scale = nothing, size = nothing) - mode in [:nearest, :bilinear, :trilinear] || - throw(ArgumentError("mode=:$mode is not supported.")) - if !(isnothing(scale) ⊻ isnothing(size)) - throw(ArgumentError("Either scale or size should be specified (but not both).")) - end - return Upsample{mode,typeof(scale),typeof(size)}(scale, size) + mode in [:nearest, :bilinear, :trilinear] || + throw(ArgumentError("mode=:$mode is not supported.")) + if !(isnothing(scale) ⊻ isnothing(size)) + throw(ArgumentError("Either scale or size should be specified (but not both).")) + end + return Upsample{mode, typeof(scale), typeof(size)}(scale, size) end Upsample(scale, mode::Symbol = :nearest) = Upsample(mode; scale) -(m::Upsample{:nearest})(x::AbstractArray) = - NNlib.upsample_nearest(x, m.scale) -function (m::Upsample{:nearest, Int})(x::AbstractArray{T, N}) where {T, N} - NNlib.upsample_nearest(x, ntuple(i -> m.scale, N-2)) +(m::Upsample{:nearest})(x::AbstractArray) = NNlib.upsample_nearest(x, m.scale) +function (m::Upsample{:nearest, Int})(x::AbstractArray{T, N}) where {T, N} + return NNlib.upsample_nearest(x, ntuple(i -> m.scale, N - 2)) +end +function (m::Upsample{:nearest, Nothing})(x::AbstractArray) + return NNlib.upsample_nearest(x; size = m.size) end -(m::Upsample{:nearest, Nothing})(x::AbstractArray) = - NNlib.upsample_nearest(x; size=m.size) -(m::Upsample{:bilinear})(x::AbstractArray) = - NNlib.upsample_bilinear(x, m.scale) -(m::Upsample{:bilinear, Nothing})(x::AbstractArray) = - NNlib.upsample_bilinear(x; size=m.size) +(m::Upsample{:bilinear})(x::AbstractArray) = NNlib.upsample_bilinear(x, m.scale) +function (m::Upsample{:bilinear, Nothing})(x::AbstractArray) + return NNlib.upsample_bilinear(x; size = m.size) +end -(m::Upsample{:trilinear})(x::AbstractArray) = - NNlib.upsample_trilinear(x, m.scale) -(m::Upsample{:trilinear, Nothing})(x::AbstractArray) = - NNlib.upsample_trilinear(x; size=m.size) +(m::Upsample{:trilinear})(x::AbstractArray) = NNlib.upsample_trilinear(x, m.scale) +function (m::Upsample{:trilinear, Nothing})(x::AbstractArray) + return NNlib.upsample_trilinear(x; size = m.size) +end function Base.show(io::IO, u::Upsample{mode}) where {mode} - print(io, "Upsample(") - print(io, ":", mode) - u.scale !== nothing && print(io, ", scale = $(u.scale)") - u.size !== nothing && print(io, ", size = $(u.size)") - print(io, ")") + print(io, "Upsample(") + print(io, ":", mode) + u.scale !== nothing && print(io, ", scale = $(u.scale)") + u.size !== nothing && print(io, ", size = $(u.size)") + return print(io, ")") end """ @@ -77,14 +78,15 @@ end Pixel shuffling layer with upscale factor `r`. Usually used for generating higher resolution images while upscaling them. - + See [`NNlib.pixel_shuffle`](@ref). # Examples + ```jldoctest julia> p = PixelShuffle(2); -julia> xs = [2row + col + channel/10 for row in 1:2, col in 1:2, channel in 1:4, n in 1:1] +julia> xs = [2row + col + channel / 10 for row in 1:2, col in 1:2, channel in 1:4, n in 1:1] 2×2×4×1 Array{Float64, 4}: [:, :, 1, 1] = 3.1 4.1 @@ -110,7 +112,7 @@ julia> p(xs) 5.1 5.3 6.1 6.3 5.2 5.4 6.2 6.4 -julia> xs = [3row + col + channel/10 for row in 1:2, col in 1:3, channel in 1:4, n in 1:1] +julia> xs = [3row + col + channel / 10 for row in 1:2, col in 1:3, channel in 1:4, n in 1:1] 2×3×4×1 Array{Float64, 4}: [:, :, 1, 1] = 4.1 5.1 6.1 @@ -137,8 +139,8 @@ julia> p(xs) 7.2 7.4 8.2 8.4 9.2 9.4 ``` """ -struct PixelShuffle - r::Int +struct PixelShuffle + r::Int end (m::PixelShuffle)(x) = NNlib.pixel_shuffle(x, m.r) diff --git a/src/loading.jl b/src/loading.jl index 9098828a8b..d360150447 100644 --- a/src/loading.jl +++ b/src/loading.jl @@ -1,35 +1,46 @@ loadleaf!(dst, src, err) = dst -loadleaf!(dst::AbstractArray, src, err) = - error("Tried to copy $src into an array destination; this is not allowed.") -loadleaf!(dst, src::AbstractArray, err) = - error("Tried to copy an array to $dst; this is not allowed.") +function loadleaf!(dst::AbstractArray, src, err) + return error("Tried to copy $src into an array destination; this is not allowed.") +end +function loadleaf!(dst, src::AbstractArray, err) + return error("Tried to copy an array to $dst; this is not allowed.") +end function loadleaf!(dst::AbstractArray, src::Bool, err) - if iszero(src) - dst .= src - else - error("Cannot copy boolean parameter == true to non-zero parameter.") - end - return dst + if iszero(src) + dst .= src + else + error("Cannot copy boolean parameter == true to non-zero parameter.") + end + return dst +end +function loadleaf!(dst::Bool, src::AbstractArray, err) + return iszero(dst) ? dst : + error("Cannot copy non-zero parameter to boolean parameter == true.") end -loadleaf!(dst::Bool, src::AbstractArray, err) = iszero(dst) ? dst : - error("Cannot copy non-zero parameter to boolean parameter == true.") function loadleaf!(dst::AbstractArray, src::AbstractArray, err) - (size(dst) == size(src)) || throw(err) - copyto!(dst, src) + (size(dst) == size(src)) || throw(err) + return copyto!(dst, src) end -_tie_check(dst::Bool, src::AbstractArray) = iszero(dst) || - error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") -_tie_check(dst::AbstractArray, src::Bool) = (iszero(dst) && iszero(src)) || - error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") -_tie_check(dst::AbstractArray, src::AbstractArray) = (dst == src) || - error("Encountered tied destination parameters with untied and mismatched sources.") +function _tie_check(dst::Bool, src::AbstractArray) + return iszero(dst) || + error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") +end +function _tie_check(dst::AbstractArray, src::Bool) + return (iszero(dst) && iszero(src)) || + error("Encountered tied parameter with boolean source at some nodes and non-boolean sources at others.") +end +function _tie_check(dst::AbstractArray, src::AbstractArray) + return (dst == src) || + error("Encountered tied destination parameters with untied and mismatched sources.") +end _tie_check(dst, src) = true _bool_tie_check(dst, src) = true -_filter_children(f, children::NamedTuple) = - NamedTuple(filter(kv -> f(kv[2]), pairs(children))) +function _filter_children(f, children::NamedTuple) + return NamedTuple(filter(kv -> f(kv[2]), pairs(children))) +end _filter_children(f, children) = filter(f, children) """ @@ -44,8 +55,9 @@ Zero bias vectors and `bias=false` are considered equivalent (see extended help for more details). # Examples + ```julia -julia> dst = Chain(Dense(Flux.ones32(2, 5), Flux.ones32(2), tanh), Dense(2 => 1; bias = [1f0])) +julia> dst = Chain(Dense(Flux.ones32(2, 5), Flux.ones32(2), tanh), Dense(2 => 1; bias = [1.0f0])) Chain( Dense(5 => 2, tanh), # 12 parameters Dense(2 => 1), # 3 parameters @@ -54,7 +66,7 @@ Chain( julia> dst[1].weight ≈ ones(2, 5) # by construction true -julia> src = Chain(Dense(5 => 2, relu), Dense(2 => 1, bias=false)); +julia> src = Chain(Dense(5 => 2, relu), Dense(2 => 1, bias = false)); julia> Flux.loadmodel!(dst, src); @@ -68,12 +80,13 @@ true # Extended help Throws an error when: -- `dst` and `src` do not share the same fields (at any level) -- the sizes of leaf nodes are mismatched between `dst` and `src` -- copying non-array values to/from an array parameter - (except inactive parameters described below) -- `dst` is a "tied" parameter (i.e. refers to another parameter) and - loaded into multiple times with mismatched source values + + - `dst` and `src` do not share the same fields (at any level) + - the sizes of leaf nodes are mismatched between `dst` and `src` + - copying non-array values to/from an array parameter + (except inactive parameters described below) + - `dst` is a "tied" parameter (i.e. refers to another parameter) and + loaded into multiple times with mismatched source values Inactive parameters can be encoded by using the boolean value `false` instead of an array. If `dst == false` and `src` is an all-zero array, no error will be raised (and no values copied); @@ -82,22 +95,22 @@ Likewise, copying a `src` value of `false` to any `dst` array is valid, but copying a `src` value of `true` will error. """ function loadmodel!(dst, src; filter = _ -> true, cache = Base.IdSet()) - ldsts = _filter_children(filter, functor(dst)[1]) - lsrcs = _filter_children(filter, functor(src)[1]) - (keys(ldsts) == keys(lsrcs)) || - throw(ArgumentError("Tried to load $src into $dst but the structures do not match.")) - - err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.") - foreach(ldsts, lsrcs) do ldst, lsrc - if ldst in cache # we already loaded this parameter before - _tie_check(ldst, lsrc) && return ldst - elseif Functors.isleaf(ldst) # our first time loading this leaf - push!(cache, ldst) - loadleaf!(ldst, lsrc, err) - else # this isn't a leaf - loadmodel!(ldst, lsrc; filter = filter, cache = cache) + ldsts = _filter_children(filter, functor(dst)[1]) + lsrcs = _filter_children(filter, functor(src)[1]) + (keys(ldsts) == keys(lsrcs)) || + throw(ArgumentError("Tried to load $src into $dst but the structures do not match.")) + + err = DimensionMismatch("Tried to load $src into $dst but the parameter sizes do not match.") + foreach(ldsts, lsrcs) do ldst, lsrc + if ldst in cache # we already loaded this parameter before + _tie_check(ldst, lsrc) && return ldst + elseif Functors.isleaf(ldst) # our first time loading this leaf + push!(cache, ldst) + loadleaf!(ldst, lsrc, err) + else # this isn't a leaf + loadmodel!(ldst, lsrc; filter = filter, cache = cache) + end end - end - return dst + return dst end diff --git a/src/losses/Losses.jl b/src/losses/Losses.jl index 3d8f6f8149..863d075916 100644 --- a/src/losses/Losses.jl +++ b/src/losses/Losses.jl @@ -10,16 +10,16 @@ using NNlib: logsoftmax, logσ, ctc_loss, ctc_alpha, ∇ctc_loss import Base.Broadcast: broadcasted export mse, mae, msle, - label_smoothing, - crossentropy, logitcrossentropy, - binarycrossentropy, logitbinarycrossentropy, - kldivergence, - huber_loss, - tversky_loss, - dice_coeff_loss, - poisson_loss, - hinge_loss, squared_hinge_loss, - binary_focal_loss, focal_loss, siamese_contrastive_loss + label_smoothing, + crossentropy, logitcrossentropy, + binarycrossentropy, logitbinarycrossentropy, + kldivergence, + huber_loss, + tversky_loss, + dice_coeff_loss, + poisson_loss, + hinge_loss, squared_hinge_loss, + binary_focal_loss, focal_loss, siamese_contrastive_loss include("utils.jl") include("functions.jl") diff --git a/src/losses/functions.jl b/src/losses/functions.jl index 1bb14b2e74..b42845fe62 100644 --- a/src/losses/functions.jl +++ b/src/losses/functions.jl @@ -11,6 +11,7 @@ Return the loss corresponding to mean absolute error: agg(abs.(ŷ .- y)) # Example + ```jldoctest julia> y_model = [1.1, 1.9, 3.1]; @@ -19,8 +20,8 @@ julia> Flux.mae(y_model, 1:3) ``` """ function mae(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(abs.(ŷ .- y)) + _check_sizes(ŷ, y) + return agg(abs.(ŷ .- y)) end """ @@ -33,6 +34,7 @@ Return the loss corresponding to mean square error: See also: [`mae`](@ref), [`msle`](@ref), [`crossentropy`](@ref). # Example + ```jldoctest julia> y_model = [1.1, 1.9, 3.1]; @@ -43,8 +45,8 @@ julia> Flux.mse(y_model, y_true) ``` """ function mse(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(abs2.(ŷ .- y)) + _check_sizes(ŷ, y) + return agg(abs2.(ŷ .- y)) end """ @@ -58,6 +60,7 @@ The `ϵ` term provides numerical stability. Penalizes an under-estimation more than an over-estimatation. # Example + ```jldoctest julia> Flux.msle(Float32[1.1, 2.2, 3.3], 1:3) 0.009084041f0 @@ -67,8 +70,8 @@ julia> Flux.msle(Float32[0.9, 1.8, 2.7], 1:3) ``` """ function msle(ŷ, y; agg = mean, ϵ = epseltype(ŷ)) - _check_sizes(ŷ, y) - agg((log.((ŷ .+ ϵ) ./ (y .+ ϵ))) .^2 ) + _check_sizes(ŷ, y) + return agg((log.((ŷ .+ ϵ) ./ (y .+ ϵ))) .^ 2) end """ @@ -82,23 +85,24 @@ given the prediction `ŷ` and true values `y`. | δ * (|ŷ - y| - 0.5 * δ), otherwise # Example + ```jldoctest julia> ŷ = [1.1, 2.1, 3.1]; julia> Flux.huber_loss(ŷ, 1:3) # default δ = 1 > |ŷ - y| 0.005000000000000009 -julia> Flux.huber_loss(ŷ, 1:3, δ=0.05) # changes behaviour as |ŷ - y| > δ +julia> Flux.huber_loss(ŷ, 1:3, δ = 0.05) # changes behaviour as |ŷ - y| > δ 0.003750000000000005 ``` """ function huber_loss(ŷ, y; agg = mean, δ = ofeltype(ŷ, 1)) - _check_sizes(ŷ, y) - abs_error = abs.(ŷ .- y) - #TODO: remove dropgrad when Zygote can handle this function with CuArrays - temp = Zygote.dropgrad(abs_error .< δ) - x = ofeltype(ŷ, 0.5) - agg(((abs_error .^ 2) .* temp) .* x .+ δ * (abs_error .- x * δ) .* (1 .- temp)) + _check_sizes(ŷ, y) + abs_error = abs.(ŷ .- y) + #TODO: remove dropgrad when Zygote can handle this function with CuArrays + temp = Zygote.dropgrad(abs_error .< δ) + x = ofeltype(ŷ, 0.5) + return agg(((abs_error .^ 2) .* temp) .* x .+ δ * (abs_error .- x * δ) .* (1 .- temp)) end """ @@ -124,6 +128,7 @@ value of α larger the smoothing of `y`. of label smoothing to binary distributions encoded in a single number. # Example + ```jldoctest julia> y = Flux.onehotbatch([1, 1, 1, 0, 1, 0], 0:1) 2×6 OneHotMatrix(::Vector{UInt32}) with eltype Bool: @@ -135,12 +140,12 @@ julia> y_smoothed = Flux.label_smoothing(y, 0.2f0) 0.1 0.1 0.1 0.9 0.1 0.9 0.9 0.9 0.9 0.1 0.9 0.1 -julia> y_sim = softmax(y .* log(2f0)) +julia> y_sim = softmax(y .* log(2.0f0)) 2×6 Matrix{Float32}: 0.333333 0.333333 0.333333 0.666667 0.333333 0.666667 0.666667 0.666667 0.666667 0.333333 0.666667 0.333333 -julia> y_dis = vcat(y_sim[2,:]', y_sim[1,:]') +julia> y_dis = vcat(y_sim[2, :]', y_sim[1, :]') 2×6 Matrix{Float32}: 0.666667 0.666667 0.666667 0.333333 0.666667 0.333333 0.333333 0.333333 0.333333 0.666667 0.333333 0.666667 @@ -152,14 +157,14 @@ julia> Flux.crossentropy(y_dis, y) > Flux.crossentropy(y_dis, y_smoothed) true ``` """ -function label_smoothing(y::Union{AbstractArray,Number}, α::Number; dims::Int = 1) +function label_smoothing(y::Union{AbstractArray, Number}, α::Number; dims::Int = 1) if !(0 < α < 1) throw(ArgumentError("α must be between 0 and 1")) end if dims == 0 - y_smoothed = y .* (1 - α) .+ α*1//2 + y_smoothed = y .* (1 - α) .+ α * 1 // 2 elseif dims == 1 - y_smoothed = y .* (1 - α) .+ α* 1 // size(y, 1) + y_smoothed = y .* (1 - α) .+ α * 1 // size(y, 1) else throw(ArgumentError("`dims` should be either 0 or 1")) end @@ -189,6 +194,7 @@ computing the loss. See also: [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref), [`logitbinarycrossentropy`](@ref). # Example + ```jldoctest julia> y_label = Flux.onehotbatch([0, 1, 2, 1, 0], 0:2) 3×5 OneHotMatrix(::Vector{UInt32}) with eltype Bool: @@ -196,20 +202,20 @@ julia> y_label = Flux.onehotbatch([0, 1, 2, 1, 0], 0:2) ⋅ 1 ⋅ 1 ⋅ ⋅ ⋅ 1 ⋅ ⋅ -julia> y_model = softmax(reshape(-7:7, 3, 5) .* 1f0) +julia> y_model = softmax(reshape(-7:7, 3, 5) .* 1.0f0) 3×5 Matrix{Float32}: 0.0900306 0.0900306 0.0900306 0.0900306 0.0900306 0.244728 0.244728 0.244728 0.244728 0.244728 0.665241 0.665241 0.665241 0.665241 0.665241 -julia> sum(y_model; dims=1) +julia> sum(y_model; dims = 1) 1×5 Matrix{Float32}: 1.0 1.0 1.0 1.0 1.0 julia> Flux.crossentropy(y_model, y_label) 1.6076053f0 -julia> 5 * ans ≈ Flux.crossentropy(y_model, y_label; agg=sum) +julia> 5 * ans ≈ Flux.crossentropy(y_model, y_label; agg = sum) true julia> y_smooth = Flux.label_smoothing(y_label, 0.15f0) @@ -223,8 +229,8 @@ julia> Flux.crossentropy(y_model, y_smooth) ``` """ function crossentropy(ŷ, y; dims = 1, agg = mean, ϵ = epseltype(ŷ)) - _check_sizes(ŷ, y) - agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims = dims)) + _check_sizes(ŷ, y) + return agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims = dims)) end """ @@ -241,6 +247,7 @@ and [softmax](@ref Softmax) separately. See also: [`binarycrossentropy`](@ref), [`logitbinarycrossentropy`](@ref), [`label_smoothing`](@ref). # Example + ```jldoctest julia> y_label = Flux.onehotbatch(collect("abcabaa"), 'a':'c') 3×7 OneHotMatrix(::Vector{UInt32}) with eltype Bool: @@ -262,8 +269,8 @@ julia> Flux.crossentropy(softmax(y_model), y_label) ``` """ function logitcrossentropy(ŷ, y; dims = 1, agg = mean) - _check_sizes(ŷ, y) - agg(.-sum(y .* logsoftmax(ŷ; dims = dims); dims = dims)) + _check_sizes(ŷ, y) + return agg(.-sum(y .* logsoftmax(ŷ; dims = dims); dims = dims)) end """ @@ -283,22 +290,23 @@ computing the loss. See also: [`crossentropy`](@ref), [`logitcrossentropy`](@ref). # Examples + ```jldoctest -julia> y_bin = Bool[1,0,1] +julia> y_bin = Bool[1, 0, 1] 3-element Vector{Bool}: 1 0 1 -julia> y_prob = softmax(reshape(vcat(1:3, 3:5), 2, 3) .* 1f0) +julia> y_prob = softmax(reshape(vcat(1:3, 3:5), 2, 3) .* 1.0f0) 2×3 Matrix{Float32}: 0.268941 0.5 0.268941 0.731059 0.5 0.731059 -julia> Flux.binarycrossentropy(y_prob[2,:], y_bin) +julia> Flux.binarycrossentropy(y_prob[2, :], y_bin) 0.43989f0 -julia> all(p -> 0 < p < 1, y_prob[2,:]) # else DomainError +julia> all(p -> 0 < p < 1, y_prob[2, :]) # else DomainError true julia> y_hot = Flux.onehotbatch(y_bin, 0:1) @@ -311,8 +319,8 @@ julia> Flux.crossentropy(y_prob, y_hot) ``` """ function binarycrossentropy(ŷ, y; agg = mean, ϵ = epseltype(ŷ)) - _check_sizes(ŷ, y) - agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ))) + _check_sizes(ŷ, y) + return agg(@.(-xlogy(y, ŷ + ϵ)-xlogy(1 - y, 1 - ŷ + ϵ))) end """ @@ -324,8 +332,9 @@ Mathematically equivalent to See also: [`crossentropy`](@ref), [`logitcrossentropy`](@ref). # Examples + ```jldoctest -julia> y_bin = Bool[1,0,1]; +julia> y_bin = Bool[1, 0, 1]; julia> y_model = Float32[2, -1, pi] 3-element Vector{Float32}: @@ -341,8 +350,8 @@ julia> Flux.binarycrossentropy(sigmoid.(y_model), y_bin) ``` """ function logitbinarycrossentropy(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(@.((1 - y) * ŷ - logσ(ŷ))) + _check_sizes(ŷ, y) + return agg(@.((1 - y) * ŷ-logσ(ŷ))) end """ @@ -356,6 +365,7 @@ The KL divergence is a measure of how much one probability distribution is diffe from the other. It is always non-negative, and zero only when both the distributions are equal. # Example + ```jldoctest julia> p1 = [1 0; 0 1] 2×2 Matrix{Int64}: @@ -381,10 +391,10 @@ Inf ``` """ function kldivergence(ŷ, y; dims = 1, agg = mean, ϵ = epseltype(ŷ)) - _check_sizes(ŷ, y) - entropy = agg(sum(xlogx.(y), dims = dims)) - cross_entropy = crossentropy(ŷ, y; dims = dims, agg = agg, ϵ = ϵ) - return entropy + cross_entropy + _check_sizes(ŷ, y) + entropy = agg(sum(xlogx.(y), dims = dims)) + cross_entropy = crossentropy(ŷ, y; dims = dims, agg = agg, ϵ = ϵ) + return entropy + cross_entropy end """ @@ -398,6 +408,7 @@ distribution `y`; calculated as - [More information.](https://peltarion.com/knowledge-center/documentation/modeling-view/build-an-ai-model/loss-functions/poisson). # Example + ```jldoctest julia> y_model = [1, 3, 3]; # data should only take integral values @@ -406,8 +417,8 @@ julia> Flux.poisson_loss(y_model, 1:3) ``` """ function poisson_loss(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(ŷ .- xlogy.(y, ŷ)) + _check_sizes(ŷ, y) + return agg(ŷ .- xlogy.(y, ŷ)) end """ @@ -422,6 +433,7 @@ Usually used with classifiers like Support Vector Machines. See also: [`squared_hinge_loss`](@ref) # Example + ```jldoctest julia> y_true = [1, -1, 1, 1]; @@ -441,8 +453,8 @@ true ``` """ function hinge_loss(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg(max.(0, 1 .- ŷ .* y)) + _check_sizes(ŷ, y) + return agg(max.(0, 1 .- ŷ .* y)) end """ @@ -457,6 +469,7 @@ Usually used with classifiers like Support Vector Machines. See also: [`hinge_loss`](@ref) # Example + ```jldoctes julia> y_true = [1, -1, 1, 1]; @@ -476,8 +489,8 @@ true ``` """ function squared_hinge_loss(ŷ, y; agg = mean) - _check_sizes(ŷ, y) - agg((max.(0, 1 .- ŷ .* y)) .^ 2) + _check_sizes(ŷ, y) + return agg((max.(0, 1 .- ŷ .* y)) .^ 2) end """ @@ -491,6 +504,7 @@ The dice coefficient is similar to the F1_score. Loss calculated as: 1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth) # Example + ```jldoctest julia> y_pred = [1.1, 2.1, 3.1]; @@ -502,8 +516,8 @@ julia> 1 - Flux.dice_coeff_loss(y_pred, 1:3) # ~ F1 score for image segmentatio ``` """ function dice_coeff_loss(ŷ, y; smooth = ofeltype(ŷ, 1.0)) - _check_sizes(ŷ, y) - 1 - (2 * sum(y .* ŷ) + smooth) / (sum(y .^ 2) + sum(ŷ .^ 2) + smooth) #TODO agg + _check_sizes(ŷ, y) + return 1 - (2 * sum(y .* ŷ) + smooth) / (sum(y .^ 2) + sum(ŷ .^ 2) + smooth) #TODO agg end """ @@ -515,14 +529,13 @@ Larger β weigh recall more than precision (by placing more emphasis on false ne Calculated as: 1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + (1 - β)*(1 .- y) .* ŷ + β*y .* (1 .- ŷ)) + 1) - """ function tversky_loss(ŷ, y; β = ofeltype(ŷ, 0.7)) _check_sizes(ŷ, y) #TODO add agg num = sum(y .* ŷ) + 1 den = sum(y .* ŷ + β * (1 .- y) .* ŷ + (1 - β) * y .* (1 .- ŷ)) + 1 - 1 - num / den + return 1 - num / den end """ @@ -536,15 +549,16 @@ For `γ == 0`, the loss is mathematically equivalent to [`Losses.binarycrossentr See also: [`Losses.focal_loss`](@ref) for multi-class setting # Example + ```jldoctest -julia> y = [0 1 0 - 1 0 1] +julia> y = [0 1 0 + 1 0 1] 2×3 Matrix{Int64}: 0 1 0 1 0 1 -julia> ŷ = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] +julia> ŷ = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] 2×3 Matrix{Float64}: 0.268941 0.5 0.268941 0.731059 0.5 0.731059 @@ -553,14 +567,14 @@ julia> Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385 true ``` """ -function binary_focal_loss(ŷ, y; agg=mean, γ=2, ϵ=epseltype(ŷ)) +function binary_focal_loss(ŷ, y; agg = mean, γ = 2, ϵ = epseltype(ŷ)) _check_sizes(ŷ, y) ŷ = ŷ .+ ϵ - p_t = y .* ŷ + (1 .- y) .* (1 .- ŷ) + p_t = y .* ŷ + (1 .- y) .* (1 .- ŷ) ce = -log.(p_t) weight = (1 .- p_t) .^ γ loss = weight .* ce - agg(loss) + return agg(loss) end """ @@ -575,16 +589,17 @@ The modulating factor, `γ`, controls the down-weighting strength. For `γ == 0`, the loss is mathematically equivalent to [`Losses.crossentropy`](@ref). # Example + ```jldoctest -julia> y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] +julia> y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] 3×5 Matrix{Int64}: 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 -julia> ŷ = softmax(reshape(-7:7, 3, 5) .* 1f0) +julia> ŷ = softmax(reshape(-7:7, 3, 5) .* 1.0f0) 3×5 Matrix{Float32}: 0.0900306 0.0900306 0.0900306 0.0900306 0.0900306 0.244728 0.244728 0.244728 0.244728 0.244728 @@ -595,25 +610,25 @@ true ``` See also: [`Losses.binary_focal_loss`](@ref) for binary (not one-hot) labels - """ -function focal_loss(ŷ, y; dims=1, agg=mean, γ=2, ϵ=epseltype(ŷ)) +function focal_loss(ŷ, y; dims = 1, agg = mean, γ = 2, ϵ = epseltype(ŷ)) _check_sizes(ŷ, y) ŷ = ŷ .+ ϵ - agg(sum(@. -y * (1 - ŷ)^γ * log(ŷ); dims=dims)) + return agg(sum(@. -y * (1 - ŷ)^γ * log(ŷ); dims = dims)) end """ siamese_contrastive_loss(ŷ, y; margin = 1, agg = mean) - + Return the [contrastive loss](http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf) which can be useful for training Siamese Networks. It is given by - + agg(@. (1 - y) * ŷ^2 + y * max(0, margin - ŷ)^2) - + Specify `margin` to set the baseline for distance at which pairs are dissimilar. # Example + ```jldoctest julia> ŷ = [0.5, 1.5, 2.5]; diff --git a/src/losses/utils.jl b/src/losses/utils.jl index e42bdfbe2e..cda3e4a557 100644 --- a/src/losses/utils.jl +++ b/src/losses/utils.jl @@ -4,8 +4,8 @@ Return `x * log(x)` for `x ≥ 0`, handling `x == 0` by taking the limit from above, to get zero. """ function xlogx(x) - result = x * log(x) - ifelse(iszero(x), zero(result), result) + result = x * log(x) + return ifelse(iszero(x), zero(result), result) end """ @@ -14,24 +14,25 @@ end Return `x * log(y)` for `y > 0`, and zero when `x == 0`. """ function xlogy(x, y) - result = x * log(y) - ifelse(iszero(x), zero(result), result) + result = x * log(y) + return ifelse(iszero(x), zero(result), result) end @adjoint function broadcasted(::typeof(xlogy), x::Zygote.Numeric, y::Zygote.Numeric) - res = xlogy.(x, y) - res, Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), Zygote.unbroadcast(y, Δ .* x ./ y)) + res = xlogy.(x, y) + return res, + Δ -> (nothing, Zygote.unbroadcast(x, xlogy.(Δ, y)), + Zygote.unbroadcast(y, Δ .* x ./ y)) end -ChainRulesCore.@scalar_rule xlogy(x, y) (log(y), x/y) # should help Diffractor's broadcasting -ChainRulesCore.@scalar_rule xlogx(x) (log(y) + true) +ChainRulesCore.@scalar_rule xlogy(x, y) (log(y), x / y) # should help Diffractor's broadcasting +ChainRulesCore.@scalar_rule xlogx(x) (log(y)+true) function _check_sizes(ŷ::AbstractArray, y::AbstractArray) - for d in 1:max(ndims(ŷ), ndims(y)) - size(ŷ,d) == size(y,d) || throw(DimensionMismatch( - "loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))" - )) - end + for d in 1:max(ndims(ŷ), ndims(y)) + size(ŷ, d) == size(y, d) || + throw(DimensionMismatch("loss function expects size(ŷ) = $(size(ŷ)) to match size(y) = $(size(y))")) + end end _check_sizes(ŷ, y) = nothing # pass-through, for constant label e.g. y = 1 diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index e691ce0170..fa78f513d8 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -4,10 +4,10 @@ using LinearAlgebra import ArrayInterface export train!, update!, - Descent, Adam, Momentum, Nesterov, RMSProp, - AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW,RAdam, OAdam, AdaBelief, - InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, - ClipValue, ClipNorm + Descent, Adam, Momentum, Nesterov, RMSProp, + AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW, RAdam, OAdam, AdaBelief, + InvDecay, ExpDecay, WeightDecay, stop, skip, Optimiser, + ClipValue, ClipNorm include("optimisers.jl") include("train.jl") diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index ce72a4b0ce..f4d9687384 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -14,10 +14,12 @@ Classic gradient descent optimiser with learning rate `η`. For each parameter `p` and its gradient `δp`, this runs `p -= η*δp` # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. + + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. # Examples + ```julia opt = Descent() @@ -26,20 +28,20 @@ opt = Descent(0.3) ps = Flux.params(model) gs = gradient(ps) do - loss(x, y) + return loss(x, y) end Flux.Optimise.update!(opt, ps, gs) ``` """ mutable struct Descent <: AbstractOptimiser - eta::Float64 + eta::Float64 end Descent() = Descent(0.1) function apply!(o::Descent, x, Δ) - Δ .*= o.eta + return Δ .*= o.eta end """ @@ -48,12 +50,14 @@ end Gradient descent optimizer with learning rate `η` and momentum `ρ`. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Momentum (`ρ`): Controls the acceleration of gradient descent in the - prominent direction, in effect damping oscillations. + + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. + - Momentum (`ρ`): Controls the acceleration of gradient descent in the + prominent direction, in effect damping oscillations. # Examples + ```julia opt = Momentum() @@ -61,18 +65,18 @@ opt = Momentum(0.01, 0.99) ``` """ mutable struct Momentum <: AbstractOptimiser - eta::Float64 - rho::Float64 - velocity::IdDict + eta::Float64 + rho::Float64 + velocity::IdDict end Momentum(η = 0.01, ρ = 0.9) = Momentum(η, ρ, IdDict()) function apply!(o::Momentum, x, Δ) - η, ρ = o.eta, o.rho - v = get!(() -> zero(x), o.velocity, x)::typeof(x) - @. v = ρ * v - η * Δ - @. Δ = -v + η, ρ = o.eta, o.rho + v = get!(() -> zero(x), o.velocity, x)::typeof(x) + @. v = ρ * v - η * Δ + @. Δ = -v end """ @@ -81,12 +85,14 @@ end Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the - prominent direction, in effect damping oscillations. + + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. + - Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the + prominent direction, in effect damping oscillations. # Examples + ```julia opt = Nesterov() @@ -94,19 +100,19 @@ opt = Nesterov(0.003, 0.95) ``` """ mutable struct Nesterov <: AbstractOptimiser - eta::Float64 - rho::Float64 - velocity::IdDict + eta::Float64 + rho::Float64 + velocity::IdDict end Nesterov(η = 0.001, ρ = 0.9) = Nesterov(η, ρ, IdDict()) function apply!(o::Nesterov, x, Δ) - η, ρ = o.eta, o.rho - v = get!(() -> zero(x), o.velocity, x)::typeof(x) - d = @. ρ^2 * v - (1+ρ) * η * Δ - @. v = ρ*v - η*Δ - @. Δ = -d + η, ρ = o.eta, o.rho + v = get!(() -> zero(x), o.velocity, x)::typeof(x) + d = @. ρ^2 * v - (1 + ρ) * η * Δ + @. v = ρ * v - η * Δ + @. Δ = -d end """ @@ -131,19 +137,19 @@ opt = RMSProp(0.002, 0.95) ``` """ mutable struct RMSProp <: AbstractOptimiser - eta::Float64 - rho::Float64 - epsilon::Float64 - acc::IdDict + eta::Float64 + rho::Float64 + epsilon::Float64 + acc::IdDict end RMSProp(η::Real = 0.001, ρ::Real = 0.9, ϵ::Real = EPS) = RMSProp(η, ρ, ϵ, IdDict()) RMSProp(η::Real, ρ::Real, acc::IdDict) = RMSProp(η, ρ, EPS, acc) function apply!(o::RMSProp, x, Δ) - η, ρ = o.eta, o.rho - acc = get!(() -> zero(x), o.acc, x)::typeof(x) - @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) - @. Δ *= η / (√acc + o.epsilon) + η, ρ = o.eta, o.rho + acc = get!(() -> zero(x), o.acc, x)::typeof(x) + @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) + @. Δ *= η / (√acc + o.epsilon) end """ @@ -165,27 +171,28 @@ opt = Adam(0.001, (0.9, 0.8)) ``` """ mutable struct Adam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end Adam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = Adam(η, β, ϵ, IdDict()) Adam(η::Real, β::Tuple, state::IdDict) = Adam(η, β, EPS, state) function apply!(o::Adam, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, vt, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} + mt, vt, βp = get!(o.state, x) do + return (zero(x), zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η - βp .= βp .* β + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) + @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η + βp .= βp .* β - return Δ + return Δ end """ @@ -207,35 +214,37 @@ opt = RAdam(0.001, (0.9, 0.8)) ``` """ mutable struct RAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end RAdam(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = RAdam(η, β, ϵ, IdDict()) RAdam(η::Real, β::Tuple, state::IdDict) = RAdam(η, β, EPS, state) function apply!(o::RAdam, x, Δ) - η, β = o.eta, o.beta - ρ∞ = 2/(1-β[2])-1 - - mt, vt, βp, t = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]], Ref(1)) - end :: Tuple{typeof(x),typeof(x),Vector{Float64},Base.RefValue{Int}} - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - ρ = ρ∞ - 2t[] * βp[2] / (1 - βp[2]) - if ρ > 4 - r = sqrt((ρ-4)*(ρ-2)*ρ∞/((ρ∞-4)*(ρ∞-2)*ρ)) - @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η * r - else - @. Δ = mt / (1 - βp[1]) * η - end - βp .= βp .* β - t[] += 1 + η, β = o.eta, o.beta + ρ∞ = 2 / (1 - β[2]) - 1 + + mt, vt, βp, t = get!(o.state, + x) do + return (zero(x), zero(x), Float64[β[1], β[2]], + Ref(1)) + end::Tuple{typeof(x), typeof(x), Vector{Float64}, Base.RefValue{Int}} + + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) + ρ = ρ∞ - 2t[] * βp[2] / (1 - βp[2]) + if ρ > 4 + r = sqrt((ρ - 4) * (ρ - 2) * ρ∞ / ((ρ∞ - 4) * (ρ∞ - 2) * ρ)) + @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η * r + else + @. Δ = mt / (1 - βp[1]) * η + end + βp .= βp .* β + t[] += 1 - return Δ + return Δ end """ @@ -257,27 +266,28 @@ opt = AdaMax(0.001, (0.9, 0.995)) ``` """ mutable struct AdaMax <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end AdaMax(η::Real = 0.001, β::Tuple = (0.9, 0.999), ϵ::Real = EPS) = AdaMax(η, β, ϵ, IdDict()) AdaMax(η::Real, β::Tuple, state::IdDict) = AdaMax(η, β, EPS, state) function apply!(o::AdaMax, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, ut, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} + mt, ut, βp = get!(o.state, x) do + return (zero(x), zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. ut = max(β[2] * ut, abs(Δ)) - @. Δ = (η/(1 - βp[1])) * mt/(ut + o.epsilon) - βp .= βp .* β + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. ut = max(β[2] * ut, abs(Δ)) + @. Δ = (η / (1 - βp[1])) * mt / (ut + o.epsilon) + βp .= βp .* β - return Δ + return Δ end """ @@ -300,29 +310,31 @@ opt = OAdam(0.001, (0.9, 0.995)) ``` """ mutable struct OAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end OAdam(η::Real = 0.001, β::Tuple = (0.5, 0.9), ϵ::Real = EPS) = OAdam(η, β, ϵ, IdDict()) OAdam(η::Real, β::Tuple, state::IdDict) = RMSProp(η, β, EPS, state) function apply!(o::OAdam, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, vt, Δ_, βp = get!(o.state, x) do - (zero(x), zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}} + mt, vt, Δ_, βp = get!(o.state, + x) do + return (zero(x), zero(x), zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), typeof(x), Vector{Float64}} - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = -Δ_ - @. Δ_ = η * mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) - @. Δ += 2Δ_ - βp .= βp .* β + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) + @. Δ = -Δ_ + @. Δ_ = η * mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) + @. Δ += 2Δ_ + βp .= βp .* β - return Δ + return Δ end """ @@ -344,18 +356,18 @@ opt = AdaGrad(0.001) ``` """ mutable struct AdaGrad <: AbstractOptimiser - eta::Float64 - epsilon::Float64 - acc::IdDict + eta::Float64 + epsilon::Float64 + acc::IdDict end AdaGrad(η::Real = 0.1, ϵ::Real = EPS) = AdaGrad(η, ϵ, IdDict()) AdaGrad(η::Real, state::IdDict) = AdaGrad(η, EPS, state) function apply!(o::AdaGrad, x, Δ) - η = o.eta - acc = get!(() -> fill!(similar(x), o.epsilon), o.acc, x)::typeof(x) - @. acc += Δ * conj(Δ) - @. Δ *= η / (√acc + o.epsilon) + η = o.eta + acc = get!(() -> fill!(similar(x), o.epsilon), o.acc, x)::typeof(x) + @. acc += Δ * conj(Δ) + @. Δ *= η / (√acc + o.epsilon) end """ @@ -376,22 +388,22 @@ opt = AdaDelta(0.89) ``` """ mutable struct AdaDelta <: AbstractOptimiser - rho::Float64 - epsilon::Float64 - state::IdDict{Any, Any} + rho::Float64 + epsilon::Float64 + state::IdDict{Any, Any} end AdaDelta(ρ::Real = 0.9, ϵ::Real = EPS) = AdaDelta(ρ, ϵ, IdDict()) AdaDelta(ρ::Real, state::IdDict) = AdaDelta(ρ, EPS, state) function apply!(o::AdaDelta, x, Δ) - ρ = o.rho - acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)} - @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) - # DON'T remove epsilon from numerator - # or even out of the square roots - @. Δ *= √(Δacc + o.epsilon) / √(acc + o.epsilon) - @. Δacc = ρ * Δacc + (1 - ρ) * Δ * conj(Δ) - return Δ + ρ = o.rho + acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2, typeof(x)} + @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ) + # DON'T remove epsilon from numerator + # or even out of the square roots + @. Δ *= √(Δacc + o.epsilon) / √(acc + o.epsilon) + @. Δacc = ρ * Δacc + (1 - ρ) * Δ * conj(Δ) + return Δ end """ @@ -414,25 +426,26 @@ opt = AMSGrad(0.001, (0.89, 0.995)) ``` """ mutable struct AMSGrad <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64, Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end AMSGrad(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AMSGrad(η, β, ϵ, IdDict()) AMSGrad(η::Real, β::Tuple, state::IdDict) = AMSGrad(η, β, EPS, state) function apply!(o::AMSGrad, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, vt, v̂t = get!(o.state, x) do - (fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon)) - end :: NTuple{3,typeof(x)} + mt, vt, v̂t = get!(o.state, x) do + return (fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon), + fill!(similar(x), o.epsilon)) + end::NTuple{3, typeof(x)} - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2 - @. v̂t = max(v̂t, vt) - @. Δ = η * mt / (√v̂t + o.epsilon) + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ^2 + @. v̂t = max(v̂t, vt) + @. Δ = η * mt / (√v̂t + o.epsilon) end """ @@ -455,28 +468,30 @@ opt = NAdam(0.002, (0.89, 0.995)) ``` """ mutable struct NAdam <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64, Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end NAdam(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = NAdam(η, β, ϵ, IdDict()) NAdam(η::Real, β::Tuple, state::IdDict) = NAdam(η, β, EPS, state) function apply!(o::NAdam, x, Δ) - η, β = o.eta, o.beta + η, β = o.eta, o.beta - mt, vt, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[o.beta[1], o.beta[2]]) - end :: Tuple{typeof(x),typeof(x),Vector{Float64}} - β1p, β2p = βp + mt, vt, βp = get!(o.state, x) do + return (zero(x), zero(x), + Float64[o.beta[1], o.beta[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} + β1p, β2p = βp - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) - @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η - βp .= βp .* β + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ) + @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / + (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η + βp .= βp .* β - return Δ + return Δ end """ @@ -486,21 +501,22 @@ end weight decay regularization. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the - second (β2) momentum estimate. -- `decay`: Decay applied to weights during optimisation. + + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. + - Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the + second (β2) momentum estimate. + - `decay`: Decay applied to weights during optimisation. # Examples + ```julia opt = AdamW() opt = AdamW(0.001, (0.89, 0.995), 0.1) ``` """ -AdamW(η = 0.001, β = (0.9, 0.999), decay = 0) = - Optimiser(Adam(η, β), WeightDecay(decay)) +AdamW(η = 0.001, β = (0.9, 0.999), decay = 0) = Optimiser(Adam(η, β), WeightDecay(decay)) """ AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ = $EPS) @@ -522,40 +538,40 @@ opt = AdaBelief(0.001, (0.9, 0.8)) ``` """ mutable struct AdaBelief <: AbstractOptimiser - eta::Float64 - beta::Tuple{Float64,Float64} - epsilon::Float64 - state::IdDict{Any, Any} + eta::Float64 + beta::Tuple{Float64, Float64} + epsilon::Float64 + state::IdDict{Any, Any} end AdaBelief(η::Real = 0.001, β = (0.9, 0.999), ϵ::Real = EPS) = AdaBelief(η, β, ϵ, IdDict()) AdaBelief(η::Real, β::Tuple, state::IdDict) = AdaBelief(η, β, EPS, state) function apply!(o::AdaBelief, x, Δ) - η, β = o.eta, o.beta - - mt, st, βp = get!(o.state, x) do - (zero(x), zero(x), Float64[β[1], β[2]]) - end :: Tuple{typeof(x), typeof(x), Vector{Float64}} + η, β = o.eta, o.beta + + mt, st, βp = get!(o.state, x) do + return (zero(x), zero(x), + Float64[β[1], β[2]]) + end::Tuple{typeof(x), typeof(x), Vector{Float64}} + + #= st is a variance and can go to zero. This is in contrast to Adam, which uses the + second moment which is usually far enough from zero. This is problematic, since st + can be slightly negative due to numerical error, and the square root below will fail. + Also, if we want to differentiate through the optimizer, √0 is not differentiable. + To protect against this, we add a small number, st -> st + eps2. + The original implementation (https://github.com/juntang-zhuang/Adabelief-Optimizer) + uses the square of Adam's epsilon, which we do here. + See also: https://github.com/juntang-zhuang/Adabelief-Optimizer/issues/61 =# + eps2 = o.epsilon^2 # TODO: make epsilon^2 the default in next breaking release + + @. mt = β[1] * mt + (1 - β[1]) * Δ + @. st = β[2] * st + (1 - β[2]) * (Δ - mt) * conj(Δ - mt) + eps2 + @. Δ = η * mt / (1 - βp[1]) / (√(st / (1 - βp[2])) + eps2) + βp .= βp .* β - #= st is a variance and can go to zero. This is in contrast to Adam, which uses the - second moment which is usually far enough from zero. This is problematic, since st - can be slightly negative due to numerical error, and the square root below will fail. - Also, if we want to differentiate through the optimizer, √0 is not differentiable. - To protect against this, we add a small number, st -> st + eps2. - The original implementation (https://github.com/juntang-zhuang/Adabelief-Optimizer) - uses the square of Adam's epsilon, which we do here. - See also: https://github.com/juntang-zhuang/Adabelief-Optimizer/issues/61 =# - eps2 = o.epsilon^2 # TODO: make epsilon^2 the default in next breaking release - - @. mt = β[1] * mt + (1 - β[1]) * Δ - @. st = β[2] * st + (1 - β[2]) * (Δ - mt) * conj(Δ - mt) + eps2 - @. Δ = η * mt / (1 - βp[1]) / (√(st / (1 - βp[2])) + eps2) - βp .= βp .* β - - return Δ + return Δ end - # Compose optimizers """ @@ -566,21 +582,22 @@ that will be fed into the next, and this is finally applied to the parameter as usual. """ mutable struct Optimiser <: AbstractOptimiser - os::Vector{Any} + os::Vector{Any} end Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...]) -@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, Base.setindex! +@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, + Base.setindex! @forward Optimiser.os Base.iterate Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...) function apply!(o::Optimiser, x, Δ) - for opt in o.os - Δ = apply!(opt, x, Δ) - end - return Δ + for opt in o.os + Δ = apply!(opt, x, Δ) + end + return Δ end """ @@ -595,28 +612,28 @@ for more general scheduling techniques. # Examples -`InvDecay` is typically composed with other optimizers +`InvDecay` is typically composed with other optimizers as the last transformation of the gradient: ```julia # Inverse decay of the learning rate # with starting value 0.001 and decay coefficient 0.01. -opt = Optimiser(Adam(1f-3), InvDecay(1f-2)) +opt = Optimiser(Adam(1.0f-3), InvDecay(1.0f-2)) ``` """ mutable struct InvDecay <: AbstractOptimiser - gamma::Float64 - state::IdDict{Any, Int} + gamma::Float64 + state::IdDict{Any, Int} end InvDecay(γ = 0.001) = InvDecay(γ, IdDict{Any, Int}()) function apply!(o::InvDecay, x, Δ) - γ = o.gamma - n = get!(o.state, x, 1) - Δ .*= 1 / (1 + γ * n) - o.state[x] = n + 1 - return Δ + γ = o.gamma + n = get!(o.state, x, 1) + Δ .*= 1 / (1 + γ * n) + o.state[x] = n + 1 + return Δ end """ @@ -626,73 +643,77 @@ Discount the learning rate `η` by the factor `decay` every `decay_step` steps t a minimum of `clip`. # Parameters -- Learning rate (`η`): Amount by which gradients are discounted before updating - the weights. -- `decay`: Factor by which the learning rate is discounted. -- `decay_step`: Schedule decay operations by setting the number of steps between - two decay operations. -- `clip`: Minimum value of learning rate. -- 'start': Step at which the decay starts. + - Learning rate (`η`): Amount by which gradients are discounted before updating + the weights. + - `decay`: Factor by which the learning rate is discounted. + - `decay_step`: Schedule decay operations by setting the number of steps between + two decay operations. + - `clip`: Minimum value of learning rate. + - 'start': Step at which the decay starts. See also the [Scheduling Optimisers](@ref) section of the docs for more general scheduling techniques. # Examples -`ExpDecay` is typically composed with other optimizers +`ExpDecay` is typically composed with other optimizers as the last transformation of the gradient: + ```julia opt = Optimiser(Adam(), ExpDecay(1.0)) ``` + Note: you may want to start with `η=1` in `ExpDecay` when combined with other optimizers (`Adam` in this case) that have their own learning rate. """ mutable struct ExpDecay <: AbstractOptimiser - eta::Float64 - decay::Float64 - step::Int64 - clip::Float64 - start::Int64 - current::IdDict + eta::Float64 + decay::Float64 + step::Int64 + clip::Float64 + start::Int64 + current::IdDict end -ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4, start = 0) = - ExpDecay(opt, decay, decay_step, clip, start, IdDict()) +function ExpDecay(opt = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4, start = 0) + return ExpDecay(opt, decay, decay_step, clip, start, IdDict()) +end function apply!(o::ExpDecay, x, Δ) - η, s, decay, start = o.eta, o.step, o.decay, o.start - n = o.current[x] = get(o.current, x, 0) + 1 - if n > start && n % s == 0 && count(x -> x > start && x % s == 0, values(o.current)) == 1 - η = max(η * decay, o.clip) - o.eta = η - end - @. Δ *= η + η, s, decay, start = o.eta, o.step, o.decay, o.start + n = o.current[x] = get(o.current, x, 0) + 1 + if n > start && n % s == 0 && + count(x -> x > start && x % s == 0, values(o.current)) == 1 + η = max(η * decay, o.clip) + o.eta = η + end + @. Δ *= η end """ WeightDecay(λ = 0) -Decay weights by ``λ``. +Decay weights by ``λ``. Typically composed with other optimizers as the first transformation to the gradient, -making it equivalent to adding ``L_2`` regularization +making it equivalent to adding ``L_2`` regularization with coefficient ``λ`` to the loss. # Examples ```julia -opt = Optimiser(WeightDecay(1f-4), Adam()) +opt = Optimiser(WeightDecay(1.0f-4), Adam()) ``` """ mutable struct WeightDecay <: AbstractOptimiser - wd::Real + wd::Real end WeightDecay() = WeightDecay(0) function apply!(o::WeightDecay, x, Δ) - wd = o.wd - @. Δ += wd * x + wd = o.wd + @. Δ += wd * x end """ diff --git a/src/optimise/train.jl b/src/optimise/train.jl index b6d6986285..f9c6609b57 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,7 +1,6 @@ using ProgressLogging: @progress, @withprogress, @logprogress import Zygote: Params, gradient - """ update!(opt, p, g) update!(opt, ps::Params, gs) @@ -13,16 +12,16 @@ As a result, the parameters are mutated and the optimizer's internal state may c The gradient could be mutated as well. """ function update!(opt::AbstractOptimiser, x, x̄) - x̄r = ArrayInterface.restructure(x, x̄) # address some cases where Zygote's - # output are not mutable, see #1510 - x .-= apply!(opt, x, x̄r) + x̄r = ArrayInterface.restructure(x, x̄) # address some cases where Zygote's + # output are not mutable, see #1510 + return x .-= apply!(opt, x, x̄r) end function update!(opt::AbstractOptimiser, xs::Params, gs) - for x in xs - isnothing(gs[x]) && continue - update!(opt, x, gs[x]) - end + for x in xs + isnothing(gs[x]) && continue + update!(opt, x, gs[x]) + end end # Callback niceties @@ -39,22 +38,24 @@ Call `Flux.skip()` in a callback to indicate when a callback condition is met. This will trigger the train loop to skip the current data point and not update with the calculated gradient. !!! note + `Flux.skip()` will be removed from Flux 0.14 # Examples + ```julia cb = function () - loss() > 1e7 && Flux.skip() + return loss() > 1e7 && Flux.skip() end ``` """ function skip() - Base.depwarn("""Flux.skip() will be removed from Flux 0.14. - and should be replaced with `continue` in an ordinary `for` loop.""", :skip) - throw(SkipException()) + Base.depwarn("""Flux.skip() will be removed from Flux 0.14. + and should be replaced with `continue` in an ordinary `for` loop.""", + :skip) + throw(SkipException()) end - struct StopException <: Exception end """ @@ -64,19 +65,21 @@ Call `Flux.stop()` in a callback to indicate when a callback condition is met. This will trigger the train loop to stop and exit. !!! note + `Flux.stop()` will be removed from Flux 0.14. It should be replaced with `break` in an ordinary `for` loop. # Examples + ```julia cb = function () - accuracy() > 0.9 && Flux.stop() + return accuracy() > 0.9 && Flux.stop() end ``` """ function stop() - Base.depwarn("""Flux.stop() will be removed from Flux 0.14. - It should be replaced with `break` in an ordinary `for` loop.""", :stop) - throw(StopException()) + Base.depwarn("""Flux.stop() will be removed from Flux 0.14. + It should be replaced with `break` in an ordinary `for` loop.""", :stop) + throw(StopException()) end batchmemaybe(x) = tuple(x) @@ -84,64 +87,69 @@ batchmemaybe(x::Tuple) = x """ train!(loss, pars::Params, data, opt::AbstractOptimiser; [cb]) - -Uses a `loss` function and training `data` to improve the + +Uses a `loss` function and training `data` to improve the model's parameters according to a particular optimisation rule `opt`. For each `d in data`, first the gradient of the `loss` is computed like this: + ``` gradient(() -> loss(d...), pars) # if d isa Tuple gradient(() -> loss(d), pars) # otherwise ``` + Here `pars` is produced by calling [`Flux.params`](@ref) on your model. (Or just on the layers you want to train, like `train!(loss, params(model[1:end-2]), data, opt)`.) This is the "implicit" style of parameter handling. This gradient is then used by optimizer `opt` to update the parameters: + ``` update!(opt, pars, grads) ``` + The optimiser should be from the `Flux.Optimise` module (see [Optimisers](@ref)). Different optimisers can be combined using [`Flux.Optimise.Optimiser`](@ref Flux.Optimiser). This training loop iterates through `data` once. -You can use [`@epochs`](@ref) to do this several times, or +You can use [`@epochs`](@ref) to do this several times, or use for instance `Iterators.repeat` to make a longer `data` iterator. ## Callbacks [Callbacks](@ref) are given with the keyword argument `cb`. For example, this will print "training" every 10 seconds (using [`Flux.throttle`](@ref)): + ``` train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) ``` - + The callback can call [`Flux.stop`](@ref) to interrupt the training loop. Multiple callbacks can be passed to `cb` as array. """ function train!(loss, ps::Params, data, opt::AbstractOptimiser; cb = () -> ()) - cb = runall(cb) - itrsz = Base.IteratorSize(typeof(data)) - n = (itrsz == Base.HasLength()) || (itrsz == Base.HasShape{1}()) ? length(data) : 0 - @withprogress for (i, d) in enumerate(data) - try - gs = gradient(ps) do - loss(batchmemaybe(d)...) - end - update!(opt, ps, gs) - cb() - catch ex - if ex isa StopException - break - elseif ex isa SkipException - continue - else - rethrow(ex) - end + cb = runall(cb) + itrsz = Base.IteratorSize(typeof(data)) + n = (itrsz == Base.HasLength()) || (itrsz == Base.HasShape{1}()) ? length(data) : 0 + @withprogress for (i, d) in enumerate(data) + try + gs = gradient(ps) do + return loss(batchmemaybe(d)...) + end + update!(opt, ps, gs) + cb() + catch ex + if ex isa StopException + break + elseif ex isa SkipException + continue + else + rethrow(ex) + end + end + @logprogress iszero(n) ? nothing : i / n end - @logprogress iszero(n) ? nothing : i / n - end end """ @@ -151,9 +159,11 @@ Run `body` `N` times. Mainly useful for quickly doing multiple epochs of training in a REPL. !!! note + The macro `@epochs` will be removed from Flux 0.14. Please just write an ordinary `for` loop. # Examples + ```julia julia> Flux.@epochs 2 println("hello") [ Info: Epoch 1 @@ -163,10 +173,11 @@ hello ``` """ macro epochs(n, ex) - Base.depwarn("""The macro `@epochs` will be removed from Flux 0.14. - As an alternative, you can write a simple `for i in 1:epochs` loop.""", Symbol("@epochs"), force=true) - :(@progress for i = 1:$(esc(n)) - @info "Epoch $i" - $(esc(ex)) - end) + Base.depwarn("""The macro `@epochs` will be removed from Flux 0.14. + As an alternative, you can write a simple `for i in 1:epochs` loop.""", + Symbol("@epochs"), force = true) + return :(@progress for i in 1:($(esc(n))) + @info "Epoch $i" + $(esc(ex)) + end) end diff --git a/src/outputsize.jl b/src/outputsize.jl index 76c58237be..34e20a930c 100644 --- a/src/outputsize.jl +++ b/src/outputsize.jl @@ -14,21 +14,21 @@ struct Nil <: Real end @doc @doc(Nil) const nil = Nil() -Nil(::T) where T<:Number = nil -(::Type{T})(::Nil) where T<:Number = nil +Nil(::T) where {T <: Number} = nil +(::Type{T})(::Nil) where {T <: Number} = nil Base.convert(::Type{Nil}, ::Number) = nil Base.float(::Type{Nil}) = Nil for f in [:copy, :zero, :one, :oneunit, - :+, :-, :abs, :abs2, :inv, - :exp, :log, :log1p, :log2, :log10, - :sqrt, :tanh, :conj] - @eval Base.$f(::Nil) = nil + :+, :-, :abs, :abs2, :inv, + :exp, :log, :log1p, :log2, :log10, + :sqrt, :tanh, :conj] + @eval Base.$f(::Nil) = nil end for f in [:+, :-, :*, :/, :^, :mod, :div, :rem] - @eval Base.$f(::Nil, ::Nil) = nil + @eval Base.$f(::Nil, ::Nil) = nil end Base.:<(::Nil, ::Nil) = true @@ -62,10 +62,11 @@ which should work out of the box for custom layers. If `m` is a `Tuple` or `Vector`, its elements are applied in sequence, like `Chain(m...)`. # Examples + ```julia-repl julia> using Flux: outputsize -julia> outputsize(Dense(10 => 4), (10,); padbatch=true) +julia> outputsize(Dense(10 => 4), (10,); padbatch = true) (4, 1) julia> m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)); @@ -73,13 +74,17 @@ julia> m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)); julia> m(randn(Float32, 10, 10, 3, 64)) |> size (6, 6, 32, 64) -julia> outputsize(m, (10, 10, 3); padbatch=true) +julia> outputsize(m, (10, 10, 3); padbatch = true) (6, 6, 32, 1) julia> outputsize(m, (10, 10, 3, 64)) (6, 6, 32, 64) -julia> try outputsize(m, (10, 10, 7, 64)) catch e println(e) end +julia> try + outputsize(m, (10, 10, 7, 64)) + catch e + println(e) + end ┌ Error: layer Conv((3, 3), 3=>16), index 1 in Chain, gave an error with input of size (10, 10, 7, 64) └ @ Flux ~/.julia/dev/Flux/src/outputsize.jl:114 DimensionMismatch("Input channels must match! (7 vs. 3)") @@ -88,27 +93,27 @@ julia> outputsize([Dense(10 => 4), Dense(4 => 2)], (10, 1)) # Vector of layers b (2, 1) ``` """ -function outputsize(m, inputsizes::Tuple...; padbatch=false) - x = nil_input(padbatch, inputsizes...) - return size(m(x)) +function outputsize(m, inputsizes::Tuple...; padbatch = false) + x = nil_input(padbatch, inputsizes...) + return size(m(x)) end -nil_input(pad::Bool, s::Tuple{Vararg{Integer}}) = pad ? fill(nil, (s...,1)) : fill(nil, s) +nil_input(pad::Bool, s::Tuple{Vararg{Integer}}) = pad ? fill(nil, (s..., 1)) : fill(nil, s) nil_input(pad::Bool, multi::Tuple{Vararg{Integer}}...) = nil_input.(pad, multi) nil_input(pad::Bool, tup::Tuple{Vararg{Tuple}}) = nil_input(pad, tup...) -function outputsize(m::Chain, inputsizes::Tuple{Vararg{Integer}}...; padbatch=false) - x = nil_input(padbatch, inputsizes...) - for (i,lay) in enumerate(m.layers) - try - x = lay(x) - catch err - str = x isa AbstractArray ? "with input of size $(size(x))" : "" - @error "layer $lay, index $i in Chain, gave an error $str" - rethrow(err) +function outputsize(m::Chain, inputsizes::Tuple{Vararg{Integer}}...; padbatch = false) + x = nil_input(padbatch, inputsizes...) + for (i, lay) in enumerate(m.layers) + try + x = lay(x) + catch err + str = x isa AbstractArray ? "with input of size $(size(x))" : "" + @error "layer $lay, index $i in Chain, gave an error $str" + rethrow(err) + end end - end - return size(x) + return size(x) end """ @@ -118,6 +123,7 @@ For model or layer `m` accepting multiple arrays as input, this returns `size(m((x, y, ...)))` given `size_x = size(x)`, etc. # Examples + ```jldoctest julia> x, y = rand(Float32, 5, 64), rand(Float32, 7, 64); @@ -128,12 +134,13 @@ julia> Flux.outputsize(par, (5, 64), (7, 64)) julia> m = Chain(par, Dense(20 => 13), softmax); -julia> Flux.outputsize(m, (5,), (7,); padbatch=true) +julia> Flux.outputsize(m, (5,), (7,); padbatch = true) (13, 1) julia> par(x, y) == par((x, y)) == Chain(par, identity)((x, y)) true ``` + Notice that `Chain` only accepts multiple arrays as a tuple, while `Parallel` also accepts them as multiple arguments; `outputsize` always supplies the tuple. @@ -142,38 +149,43 @@ outputsize ## make tuples and vectors be like Chains -outputsize(m::Tuple, input::Tuple...; padbatch=false) = outputsize(Chain(m...), input...; padbatch=padbatch) -outputsize(m::AbstractVector, input::Tuple...; padbatch=false) = outputsize(Chain(m...), input...; padbatch=padbatch) +function outputsize(m::Tuple, input::Tuple...; padbatch = false) + return outputsize(Chain(m...), input...; padbatch = padbatch) +end +function outputsize(m::AbstractVector, input::Tuple...; padbatch = false) + return outputsize(Chain(m...), input...; padbatch = padbatch) +end ## bypass statistics in normalization layers for layer in (:BatchNorm, :InstanceNorm, :GroupNorm) # LayerNorm works fine - @eval function (l::$layer)(x::AbstractArray{Nil}) - l.chs == size(x, ndims(x)-1) || throw(DimensionMismatch( - string($layer, " expected ", l.chs, " channels, but got size(x) == ", size(x)))) - x - end + @eval function (l::$layer)(x::AbstractArray{Nil}) + l.chs == size(x, ndims(x) - 1) || + throw(DimensionMismatch(string($layer, " expected ", l.chs, + " channels, but got size(x) == ", size(x)))) + return x + end end ## fixes for layers that don't work out of the box for (fn, Dims) in ((:conv, DenseConvDims),) - @eval begin - function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{Nil}, dims::$Dims) - fill(nil, NNlib.output_size(dims)..., NNlib.channels_out(dims), size(a)[end]) - end - - function NNlib.$fn(a::AbstractArray{<:Real}, b::AbstractArray{Nil}, dims::$Dims) - NNlib.$fn(fill(nil, size(a)), b, dims) - end - - function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{<:Real}, dims::$Dims) - NNlib.$fn(a, fill(nil, size(b)), dims) + @eval begin + function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{Nil}, dims::$Dims) + return fill(nil, NNlib.output_size(dims)..., NNlib.channels_out(dims), + size(a)[end]) + end + + function NNlib.$fn(a::AbstractArray{<:Real}, b::AbstractArray{Nil}, dims::$Dims) + return NNlib.$fn(fill(nil, size(a)), b, dims) + end + + function NNlib.$fn(a::AbstractArray{Nil}, b::AbstractArray{<:Real}, dims::$Dims) + return NNlib.$fn(a, fill(nil, size(b)), dims) + end end - end end - """ @autosize (size...,) Chain(Layer(_ => 2), Layer(_), ...) @@ -187,6 +199,7 @@ The underscore may appear as an argument of a layer, or inside a `=>`. It may be used in further calculations, such as `Dense(_ => _÷4)`. # Examples + ``` julia> @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine=false)) Chain( @@ -226,45 +239,49 @@ julia> outputsize(ans, (28, 28, 1, 32)) ``` Limitations: -* While `@autosize (5, 32) Flux.Bilinear(_ => 7)` is OK, something like `Bilinear((_, _) => 7)` will fail. -* While `Scale(_)` and `LayerNorm(_)` are fine (and use the first dimension), `Scale(_,_)` and `LayerNorm(_,_)` - will fail if `size(x,1) != size(x,2)`. -* RNNs won't work: `@autosize (7, 11) LSTM(_ => 5)` fails, because `outputsize(RNN(3=>7), (3,))` also fails, a known issue. + + - While `@autosize (5, 32) Flux.Bilinear(_ => 7)` is OK, something like `Bilinear((_, _) => 7)` will fail. + - While `Scale(_)` and `LayerNorm(_)` are fine (and use the first dimension), `Scale(_,_)` and `LayerNorm(_,_)` + will fail if `size(x,1) != size(x,2)`. + - RNNs won't work: `@autosize (7, 11) LSTM(_ => 5)` fails, because `outputsize(RNN(3=>7), (3,))` also fails, a known issue. """ macro autosize(size, model) - Meta.isexpr(size, :tuple) || error("@autosize's first argument must be a tuple, the size of the input") - Meta.isexpr(model, :call) || error("@autosize's second argument must be something like Chain(layers...)") - ex = _makelazy(model) - @gensym m - quote - $m = $ex - $outputsize($m, $size) - $striplazy($m) - end |> esc + Meta.isexpr(size, :tuple) || + error("@autosize's first argument must be a tuple, the size of the input") + Meta.isexpr(model, :call) || + error("@autosize's second argument must be something like Chain(layers...)") + ex = _makelazy(model) + @gensym m + return quote + $m = $ex + $outputsize($m, $size) + $striplazy($m) + end |> esc end function _makelazy(ex::Expr) - n = _underscoredepth(ex) - n == 0 && return ex - n == 1 && error("@autosize doesn't expect an underscore here: $ex") - n == 2 && return :($LazyLayer($(string(ex)), $(_makefun(ex)), nothing)) - n > 2 && return Expr(ex.head, ex.args[1], map(_makelazy, ex.args[2:end])...) + n = _underscoredepth(ex) + n == 0 && return ex + n == 1 && error("@autosize doesn't expect an underscore here: $ex") + n == 2 && return :($LazyLayer($(string(ex)), $(_makefun(ex)), nothing)) + n > 2 && return Expr(ex.head, ex.args[1], map(_makelazy, ex.args[2:end])...) end _makelazy(x) = x function _underscoredepth(ex::Expr) - # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 - ex.head in (:call, :kw, :(->), :block) || return 0 - ex.args[1] === :(=>) && ex.args[2] === :_ && return 1 - m = maximum(_underscoredepth, ex.args) - m == 0 ? 0 : m+1 + # Meta.isexpr(ex, :tuple) && :_ in ex.args && return 10 + ex.head in (:call, :kw, :(->), :block) || return 0 + ex.args[1] === :(=>) && ex.args[2] === :_ && return 1 + m = maximum(_underscoredepth, ex.args) + return m == 0 ? 0 : m + 1 end _underscoredepth(ex) = Int(ex === :_) function _makefun(ex) - T = Meta.isexpr(ex, :call) ? ex.args[1] : Type - @gensym x s - Expr(:(->), x, Expr(:block, :($s = $autosizefor($T, $x)), _replaceunderscore(ex, s))) + T = Meta.isexpr(ex, :call) ? ex.args[1] : Type + @gensym x s + return Expr(:(->), x, + Expr(:block, :($s = $autosizefor($T, $x)), _replaceunderscore(ex, s))) end """ @@ -274,61 +291,70 @@ If an `_` in your layer's constructor, used within `@autosize`, should *not* mean the 2nd-last dimension, then you can overload this. For instance `autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1)` -is needed to make `@autosize (2,3,4) Dense(_ => 5)` return +is needed to make `@autosize (2,3,4) Dense(_ => 5)` return `Dense(2 => 5)` rather than `Dense(3 => 5)`. """ -autosizefor(::Type, x::AbstractArray) = size(x, max(1, ndims(x)-1)) +autosizefor(::Type, x::AbstractArray) = size(x, max(1, ndims(x) - 1)) autosizefor(::Type{<:Dense}, x::AbstractArray) = size(x, 1) autosizefor(::Type{<:LayerNorm}, x::AbstractArray) = size(x, 1) _replaceunderscore(e, s) = e === :_ ? s : e -_replaceunderscore(ex::Expr, s) = Expr(ex.head, map(a -> _replaceunderscore(a, s), ex.args)...) +function _replaceunderscore(ex::Expr, s) + return Expr(ex.head, map(a -> _replaceunderscore(a, s), ex.args)...) +end mutable struct LazyLayer - str::String - make::Function - layer + str::String + make::Function + layer::Any end function (l::LazyLayer)(x::AbstractArray, ys::AbstractArray...) - l.layer === nothing || return l.layer(x, ys...) - made = l.make(x) # for something like `Bilinear((_,__) => 7)`, perhaps need `make(xy...)`, later. - y = made(x, ys...) - l.layer = made # mutate after we know that call worked - return y + l.layer === nothing || return l.layer(x, ys...) + made = l.make(x) # for something like `Bilinear((_,__) => 7)`, perhaps need `make(xy...)`, later. + y = made(x, ys...) + l.layer = made # mutate after we know that call worked + return y end function striplazy(m) - fs, re = functor(m) - re(map(striplazy, fs)) + fs, re = functor(m) + return re(map(striplazy, fs)) end function striplazy(l::LazyLayer) - l.layer === nothing || return l.layer - error("LazyLayer should be initialised, e.g. by outputsize(model, size), before using stiplazy") + l.layer === nothing || return l.layer + return error("LazyLayer should be initialised, e.g. by outputsize(model, size), before using stiplazy") end # Could make LazyLayer usable outside of @autosize, for instance allow Chain(@lazy Dense(_ => 2))? # But then it will survive to produce weird structural gradients etc. function ChainRulesCore.rrule(l::LazyLayer, x) - l(x), _ -> error("LazyLayer should never be used within a gradient. Call striplazy(model) first to remove all.") + return l(x), + _ -> error("LazyLayer should never be used within a gradient. Call striplazy(model) first to remove all.") end function ChainRulesCore.rrule(::typeof(striplazy), m) - striplazy(m), _ -> error("striplazy should never be used within a gradient") + return striplazy(m), _ -> error("striplazy should never be used within a gradient") end -params!(p::Params, x::LazyLayer, seen = IdSet()) = error("LazyLayer should never be used within params(m). Call striplazy(m) first.") +function params!(p::Params, x::LazyLayer, seen = IdSet()) + return error("LazyLayer should never be used within params(m). Call striplazy(m) first.") +end -Functors.functor(::Type{<:LazyLayer}, x) = error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.") +function Functors.functor(::Type{<:LazyLayer}, x) + return error("LazyLayer should not be walked with Functors.jl, as the arrays which Flux.gpu wants to move may not exist yet.") +end function Base.show(io::IO, l::LazyLayer) - printstyled(io, "LazyLayer(", color=:light_black) - if l.layer == nothing - printstyled(io, l.str, color=:magenta) - else - printstyled(io, l.layer, color=:cyan) - end - printstyled(io, ")", color=:light_black) + printstyled(io, "LazyLayer(", color = :light_black) + if l.layer == nothing + printstyled(io, l.str, color = :magenta) + else + printstyled(io, l.layer, color = :cyan) + end + return printstyled(io, ")", color = :light_black) end -_big_show(io::IO, l::LazyLayer, indent::Int=0, name=nothing) = _layer_show(io, l, indent, name) +function _big_show(io::IO, l::LazyLayer, indent::Int = 0, name = nothing) + return _layer_show(io, l, indent, name) +end diff --git a/src/utils.jl b/src/utils.jl index 10ea2982b5..a7600ab709 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -18,7 +18,7 @@ julia> layer = Dense(10, 20); julia> Flux.nfan(size(layer.weight)) (10, 20) -julia> layer = Conv((3, 3), 2=>10); +julia> layer = Conv((3, 3), 2 => 10); julia> Flux.nfan(size(layer.weight)) (18, 90) @@ -28,7 +28,7 @@ nfan() = 1, 1 # fan_in, fan_out nfan(n) = 1, n # A vector is treated as a n×1 matrix nfan(n_out, n_in) = n_in, n_out # In case of Dense kernels: arranged as matrices nfan(dims::Tuple) = nfan(dims...) -nfan(dims...) = prod(dims[1:end-2]) .* (dims[end-1], dims[end]) # In case of convolution kernels +nfan(dims...) = prod(dims[1:(end - 2)]) .* (dims[end - 1], dims[end]) # In case of convolution kernels ofeltype(x, y) = convert(float(eltype(x)), y) epseltype(x) = eps(float(eltype(x))) @@ -38,10 +38,13 @@ epseltype(x) = eps(float(eltype(x))) Create an instance of the RNG most appropriate for `x`. The current defaults are: -- `x isa CuArray`: `CUDA.default_rng()`, else: -- `x isa AbstractArray`, or no `x` provided: - - Julia version is < 1.7: `Random.GLOBAL_RNG` - - Julia version is >= 1.7: `Random.default_rng()` + + - `x isa CuArray`: `CUDA.default_rng()`, else: + + - `x isa AbstractArray`, or no `x` provided: + + + Julia version is < 1.7: `Random.GLOBAL_RNG` + + Julia version is >= 1.7: `Random.default_rng()` """ rng_from_array(::AbstractArray) = default_rng_value() rng_from_array(::CuArray) = CUDA.default_rng() @@ -49,16 +52,16 @@ rng_from_array(::CuArray) = CUDA.default_rng() @non_differentiable rng_from_array(::Any) if VERSION >= v"1.7" - @doc """ - default_rng_value() - - Create an instance of the default RNG depending on Julia's version. - - Julia version is < 1.7: `Random.GLOBAL_RNG` - - Julia version is >= 1.7: `Random.default_rng()` - """ - default_rng_value() = Random.default_rng() + @doc """ + default_rng_value() + + Create an instance of the default RNG depending on Julia's version. + - Julia version is < 1.7: `Random.GLOBAL_RNG` + - Julia version is >= 1.7: `Random.default_rng()` + """ + default_rng_value() = Random.default_rng() else - default_rng_value() = Random.GLOBAL_RNG + default_rng_value() = Random.GLOBAL_RNG end """ @@ -71,17 +74,18 @@ distribution on the interval ``[-x, x]``, where `x = gain * sqrt(6 / (fan_in + f This method is described in [1] and also known as Xavier initialization. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) julia> Flux.glorot_uniform(3, 4) |> summary "3×4 Matrix{Float32}" -julia> round.(extrema(Flux.glorot_uniform(10, 100)), digits=3) +julia> round.(extrema(Flux.glorot_uniform(10, 100)), digits = 3) (-0.232f0, 0.234f0) -julia> round.(extrema(Flux.glorot_uniform(100, 10)), digits=3) +julia> round.(extrema(Flux.glorot_uniform(100, 10)), digits = 3) (-0.233f0, 0.233f0) -julia> round.(extrema(Flux.glorot_uniform(100, 100)), digits=3) +julia> round.(extrema(Flux.glorot_uniform(100, 100)), digits = 3) (-0.173f0, 0.173f0) julia> Dense(3 => 2, tanh; init = Flux.glorot_uniform(MersenneTwister(1))) @@ -97,12 +101,16 @@ julia> ans.bias [1] Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of training deep feedforward neural networks." _Proceedings of the thirteenth international conference on artificial intelligence and statistics_. 2010. """ -function glorot_uniform(rng::AbstractRNG, dims::Integer...; gain::Real=1) - scale = Float32(gain) * sqrt(24.0f0 / sum(nfan(dims...))) - (rand(rng, Float32, dims...) .- 0.5f0) .* scale +function glorot_uniform(rng::AbstractRNG, dims::Integer...; gain::Real = 1) + scale = Float32(gain) * sqrt(24.0f0 / sum(nfan(dims...))) + return (rand(rng, Float32, dims...) .- 0.5f0) .* scale +end +function glorot_uniform(dims::Integer...; kw...) + return glorot_uniform(default_rng_value(), dims...; kw...) +end +function glorot_uniform(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> glorot_uniform(rng, dims...; init_kwargs..., kwargs...) end -glorot_uniform(dims::Integer...; kw...) = glorot_uniform(default_rng_value(), dims...; kw...) -glorot_uniform(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> glorot_uniform(rng, dims...; init_kwargs..., kwargs...) ChainRulesCore.@non_differentiable glorot_uniform(::Any...) @@ -117,22 +125,23 @@ using [`nfan`](@ref Flux.nfan). This method is described in [1] and also known as Xavier initialization. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) julia> using Statistics -julia> round(std(Flux.glorot_normal(10, 1000)), digits=3) +julia> round(std(Flux.glorot_normal(10, 1000)), digits = 3) 0.044f0 -julia> round(std(Flux.glorot_normal(1000, 10)), digits=3) +julia> round(std(Flux.glorot_normal(1000, 10)), digits = 3) 0.044f0 -julia> round(std(Flux.glorot_normal(1000, 1000)), digits=3) +julia> round(std(Flux.glorot_normal(1000, 1000)), digits = 3) 0.032f0 -julia> Dense(10 => 1000, tanh; init = Flux.glorot_normal(gain=100)) +julia> Dense(10 => 1000, tanh; init = Flux.glorot_normal(gain = 100)) Dense(10 => 1000, tanh) # 11_000 parameters -julia> round(std(ans.weight), sigdigits=3) +julia> round(std(ans.weight), sigdigits = 3) 4.45f0 ``` @@ -140,12 +149,16 @@ julia> round(std(ans.weight), sigdigits=3) [1] Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of training deep feedforward neural networks." _Proceedings of the thirteenth international conference on artificial intelligence and statistics_. 2010. """ -function glorot_normal(rng::AbstractRNG, dims::Integer...; gain::Real=1) - std = Float32(gain) * sqrt(2.0f0 / sum(nfan(dims...))) - randn(rng, Float32, dims...) .* std +function glorot_normal(rng::AbstractRNG, dims::Integer...; gain::Real = 1) + std = Float32(gain) * sqrt(2.0f0 / sum(nfan(dims...))) + return randn(rng, Float32, dims...) .* std +end +function glorot_normal(dims::Integer...; kwargs...) + return glorot_normal(default_rng_value(), dims...; kwargs...) +end +function glorot_normal(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> glorot_normal(rng, dims...; init_kwargs..., kwargs...) end -glorot_normal(dims::Integer...; kwargs...) = glorot_normal(default_rng_value(), dims...; kwargs...) -glorot_normal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> glorot_normal(rng, dims...; init_kwargs..., kwargs...) ChainRulesCore.@non_differentiable glorot_normal(::Any...) @@ -159,14 +172,15 @@ on the interval `[-x, x]`, where `x = gain * sqrt(3/fan_in)` using [`nfan`](@ref This method is described in [1] and also known as He initialization. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) -julia> round.(extrema(Flux.kaiming_uniform(100, 10)), digits=3) +julia> round.(extrema(Flux.kaiming_uniform(100, 10)), digits = 3) (-0.774f0, 0.774f0) -julia> round.(extrema(Flux.kaiming_uniform(10, 100)), digits=3) +julia> round.(extrema(Flux.kaiming_uniform(10, 100)), digits = 3) (-0.245f0, 0.244f0) -julia> round.(extrema(Flux.kaiming_uniform(100, 100)), digits=3) +julia> round.(extrema(Flux.kaiming_uniform(100, 100)), digits = 3) (-0.245f0, 0.245f0) ``` @@ -175,12 +189,16 @@ julia> round.(extrema(Flux.kaiming_uniform(100, 100)), digits=3) [1] He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level performance on imagenet classification." _Proceedings of the IEEE international conference on computer vision_. 2015. """ function kaiming_uniform(rng::AbstractRNG, dims::Integer...; gain::Real = √2) - bound = Float32(√3 * gain / sqrt(first(nfan(dims...)))) # fan_in - return (rand(rng, Float32, dims...) .- 0.5f0) .* 2bound + bound = Float32(√3 * gain / sqrt(first(nfan(dims...)))) # fan_in + return (rand(rng, Float32, dims...) .- 0.5f0) .* 2bound end -kaiming_uniform(dims::Integer...; kwargs...) = kaiming_uniform(default_rng_value(), dims...; kwargs...) -kaiming_uniform(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> kaiming_uniform(rng, dims...; init_kwargs..., kwargs...) +function kaiming_uniform(dims::Integer...; kwargs...) + return kaiming_uniform(default_rng_value(), dims...; kwargs...) +end +function kaiming_uniform(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> kaiming_uniform(rng, dims...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable kaiming_uniform(::Any...) @@ -194,16 +212,17 @@ distribution standard deviation `gain / sqrt(fan_in)`, using [`nfan`](@ref Flux. This method is described in [1] and also known as He initialization. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) julia> using Statistics -julia> round(std(Flux.kaiming_normal(10, 1000)), digits=3) +julia> round(std(Flux.kaiming_normal(10, 1000)), digits = 3) 0.045f0 -julia> round(std(Flux.kaiming_normal(1000, 10)), digits=3) +julia> round(std(Flux.kaiming_normal(1000, 10)), digits = 3) 0.447f0 -julia> round(std(Flux.kaiming_normal(1000, 1000)), digits=3) +julia> round(std(Flux.kaiming_normal(1000, 1000)), digits = 3) 0.045f0 ``` @@ -211,20 +230,24 @@ julia> round(std(Flux.kaiming_normal(1000, 1000)), digits=3) [1] He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level performance on imagenet classification." _Proceedings of the IEEE international conference on computer vision_. 2015. """ -function kaiming_normal(rng::AbstractRNG, dims::Integer...; gain::Real = √2f0) - std = Float32(gain / sqrt(first(nfan(dims...)))) # fan_in - return randn(rng, Float32, dims...) .* std +function kaiming_normal(rng::AbstractRNG, dims::Integer...; gain::Real = √2.0f0) + std = Float32(gain / sqrt(first(nfan(dims...)))) # fan_in + return randn(rng, Float32, dims...) .* std end -kaiming_normal(dims::Integer...; kwargs...) = kaiming_normal(default_rng_value(), dims...; kwargs...) -kaiming_normal(rng::AbstractRNG; init_kwargs...) = (dims...; kwargs...) -> kaiming_normal(rng, dims...; init_kwargs..., kwargs...) +function kaiming_normal(dims::Integer...; kwargs...) + return kaiming_normal(default_rng_value(), dims...; kwargs...) +end +function kaiming_normal(rng::AbstractRNG; init_kwargs...) + return (dims...; kwargs...) -> kaiming_normal(rng, dims...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable kaiming_normal(::Any...) """ truncated_normal([rng = default_rng_value()], size...; mean = 0, std = 1, lo = -2, hi = 2) -> Array truncated_normal([rng]; kw...) -> Function - + Return an `Array{Float32}` of the given `size` where each element is drawn from a truncated normal distribution. The numbers are distributed like `filter(x -> lo<=x<=hi, mean .+ std .* randn(100))`. @@ -233,37 +256,43 @@ applying the inverse CDF of the truncated normal distribution. This method works best when `lo ≤ mean ≤ hi`. # Examples + ```jldoctest julia> using Statistics julia> Flux.truncated_normal(3, 4) |> summary "3×4 Matrix{Float32}" -julia> round.(extrema(Flux.truncated_normal(10^6)); digits=3) +julia> round.(extrema(Flux.truncated_normal(10^6)); digits = 3) (-2.0f0, 2.0f0) julia> round(std(Flux.truncated_normal(10^6; lo = -100, hi = 100))) 1.0f0 ``` """ -function truncated_normal(rng::AbstractRNG, dims::Integer...; mean = 0, std = 1, lo = -2, hi = 2) - norm_cdf(x) = 0.5 * (1 + erf(x/√2)) - if (mean < lo - 2 * std) || (mean > hi + 2 * std) - @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog=1 - end - l = norm_cdf((lo - mean) / std) - u = norm_cdf((hi - mean) / std) - xs = rand(rng, Float32, dims...) - broadcast!(xs, xs) do x - x = x * 2(u - l) + (2l - 1) - x = erfinv(x) - x = clamp(x * std * √2 + mean, lo, hi) - end - return xs -end - -truncated_normal(dims::Integer...; kwargs...) = truncated_normal(default_rng_value(), dims...; kwargs...) -truncated_normal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> truncated_normal(rng, dims...; init_kwargs..., kwargs...) +function truncated_normal(rng::AbstractRNG, dims::Integer...; mean = 0, std = 1, lo = -2, + hi = 2) + norm_cdf(x) = 0.5 * (1 + erf(x / √2)) + if (mean < lo - 2 * std) || (mean > hi + 2 * std) + @warn "Mean is more than 2 std outside the limits in truncated_normal, so the distribution of values may be inaccurate." maxlog=1 + end + l = norm_cdf((lo - mean) / std) + u = norm_cdf((hi - mean) / std) + xs = rand(rng, Float32, dims...) + broadcast!(xs, xs) do x + x = x * 2(u - l) + (2l - 1) + x = erfinv(x) + return x = clamp(x * std * √2 + mean, lo, hi) + end + return xs +end + +function truncated_normal(dims::Integer...; kwargs...) + return truncated_normal(default_rng_value(), dims...; kwargs...) +end +function truncated_normal(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> truncated_normal(rng, dims...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable truncated_normal(::Any...) @@ -278,6 +307,7 @@ For `length(size) > 2`, a `prod(size[1:(end - 1)])` by `size[end]` orthogonal ma is computed before reshaping it to the original dimensions. # Examples + ```jldoctest; setup = :(using LinearAlgebra) julia> W = Flux.orthogonal(5, 7); @@ -304,27 +334,31 @@ true # References [1] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", ICLR 2014, https://arxiv.org/abs/1312.6120 - """ function orthogonal(rng::AbstractRNG, rows::Integer, cols::Integer; gain::Real = 1) - if rows < cols - return permutedims(orthogonal(rng, cols, rows; gain)) - end - mat = randn(rng, Float32, rows, cols) - Q, R = LinearAlgebra.qr(mat) - mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* Float32(gain) - return mat + if rows < cols + return permutedims(orthogonal(rng, cols, rows; gain)) + end + mat = randn(rng, Float32, rows, cols) + Q, R = LinearAlgebra.qr(mat) + mat .= Array(Q) * sign.(LinearAlgebra.Diagonal(R)) .* Float32(gain) + return mat end function orthogonal(rng::AbstractRNG, d1::Integer, ds::Integer...; kwargs...) - dims = (d1, ds...) - rows = prod(dims[1:end-1]) - cols = dims[end] - return reshape(orthogonal(rng, rows, cols; kwargs...), dims) + dims = (d1, ds...) + rows = prod(dims[1:(end - 1)]) + cols = dims[end] + return reshape(orthogonal(rng, rows, cols; kwargs...), dims) end -orthogonal(dims::Integer...; kwargs...) = orthogonal(default_rng_value(), dims...; kwargs...) -orthogonal(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., kwargs...) +function orthogonal(dims::Integer...; kwargs...) + return orthogonal(default_rng_value(), dims...; kwargs...) +end +function orthogonal(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims::Integer...; kwargs...) -> orthogonal(rng, dims...; init_kwargs..., + kwargs...) +end ChainRulesCore.@non_differentiable orthogonal(::Any...) @@ -339,18 +373,19 @@ with a mean of zero and standard deviation `std`. This method is described in [1]. # Examples + ```jldoctest; setup = :(using Random; Random.seed!(0)) -julia> count(iszero, Flux.sparse_init(10, 10, sparsity=1/5)) +julia> count(iszero, Flux.sparse_init(10, 10, sparsity = 1 / 5)) 20 -julia> sum(0 .== Flux.sparse_init(10, 11, sparsity=0.9), dims=1) +julia> sum(0 .== Flux.sparse_init(10, 11, sparsity = 0.9), dims = 1) 1×11 Matrix{Int64}: 9 9 9 9 9 9 9 9 9 9 9 -julia> Dense(3 => 10, tanh; init=Flux.sparse_init(sparsity=0.5)) +julia> Dense(3 => 10, tanh; init = Flux.sparse_init(sparsity = 0.5)) Dense(3 => 10, tanh) # 40 parameters -julia> count(iszero, ans.weight, dims=1) +julia> count(iszero, ans.weight, dims = 1) 1×3 Matrix{Int64}: 5 5 5 ``` @@ -360,19 +395,23 @@ julia> count(iszero, ans.weight, dims=1) [1] Martens, J, "Deep learning via Hessian-free optimization" _Proceedings of the 27th International Conference on International Conference on Machine Learning_. 2010. """ function sparse_init(rng::AbstractRNG, dims::Integer...; sparsity, std = 0.01) - if length(dims) != 2 - throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) - end - rows, cols = dims - prop_zero = min(1.0, sparsity) - num_zeros = ceil(Integer, prop_zero * rows) - sparse_array = randn(rng, Float32, dims...) .* Float32(std) - sparse_array[1:num_zeros, :] .= 0f0 - return mapslices(shuffle, sparse_array, dims=1) + if length(dims) != 2 + throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization.")) + end + rows, cols = dims + prop_zero = min(1.0, sparsity) + num_zeros = ceil(Integer, prop_zero * rows) + sparse_array = randn(rng, Float32, dims...) .* Float32(std) + sparse_array[1:num_zeros, :] .= 0.0f0 + return mapslices(shuffle, sparse_array, dims = 1) end -sparse_init(dims::Integer...; kwargs...) = sparse_init(default_rng_value(), dims...; kwargs...) -sparse_init(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (dims...; kwargs...) -> sparse_init(rng, dims...; init_kwargs..., kwargs...) +function sparse_init(dims::Integer...; kwargs...) + return sparse_init(default_rng_value(), dims...; kwargs...) +end +function sparse_init(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (dims...; kwargs...) -> sparse_init(rng, dims...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable sparse_init(::Any...) @@ -387,20 +426,21 @@ Often useful in the context of transfer learning, i.e when one wants to add more a model but start from the same mapping. Has the following behaviour -* 1D: A `Vector` of `zeros` (useful for an identity bias) -* 2D: An identity matrix (useful for an identity matrix multiplication) -* More than 2D: A dense block array of center tap spatial filters (useful for an identity convolution) -Some caveats: -* Not all layers will be identity mapping when used with this init. Exceptions - include recurrent layers and normalization layers. + - 1D: A `Vector` of `zeros` (useful for an identity bias) + - 2D: An identity matrix (useful for an identity matrix multiplication) + - More than 2D: A dense block array of center tap spatial filters (useful for an identity convolution) + +Some caveats: -* Layers must have `input_size == output_size` for identity mapping to be - possible. When this is not the case, extra dimensions of the array are padded with zeros. + - Not all layers will be identity mapping when used with this init. Exceptions + include recurrent layers and normalization layers. -* For convolutional layers, in addition to the above, the kernel sizes must also be odd and - padding must be applied so that output feature maps have the same size as input feature maps, - e.g by using [`SamePad`](@ref). + - Layers must have `input_size == output_size` for identity mapping to be + possible. When this is not the case, extra dimensions of the array are padded with zeros. + - For convolutional layers, in addition to the above, the kernel sizes must also be odd and + padding must be applied so that output feature maps have the same size as input feature maps, + e.g by using [`SamePad`](@ref). Use keyword `shift` (integer or tuple) to apply circular shift to the output, equivalent to `Base.circshift(identity_init(size...), shift)`. @@ -409,20 +449,21 @@ For consistency with other initialisers, it accepts `rng::AbstractRNG` as an opt first argument. But this is ignored, since the result is not random. # Examples + ```jldoctest -julia> Flux.identity_init(3,5) +julia> Flux.identity_init(3, 5) 3×5 Matrix{Float32}: 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 -julia> Dense(5 => 3, relu, init=Flux.identity_init)([1,-2,3,-4,5]) +julia> Dense(5 => 3, relu, init = Flux.identity_init)([1, -2, 3, -4, 5]) 3-element Vector{Float32}: 1.0 0.0 3.0 -julia> Flux.identity_init(3,3,2; gain=100) +julia> Flux.identity_init(3, 3, 2; gain = 100) 3×3×2 Array{Float32, 3}: [:, :, 1] = 0.0 0.0 0.0 @@ -434,35 +475,42 @@ julia> Flux.identity_init(3,3,2; gain=100) 0.0 100.0 0.0 0.0 0.0 0.0 -julia> x4 = cat([1 2 3; 4 5 6; 7 8 9]; dims=4); +julia> x4 = cat([1 2 3; 4 5 6; 7 8 9]; dims = 4); -julia> Conv((2,2), 1 => 1, init=Flux.identity_init(gain=10), pad=SamePad())(x4) +julia> Conv((2, 2), 1 => 1, init = Flux.identity_init(gain = 10), pad = SamePad())(x4) 3×3×1×1 Array{Float32, 4}: [:, :, 1, 1] = 10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0 90.0 +``` # Assume bias ``` """ -identity_init(cols::Integer; gain::Real=1, shift=0) = zeros32(cols) # Assume bias +identity_init(cols::Integer; gain::Real = 1, shift = 0) = zeros32(cols) # Assume bias # Assume matrix multiplication -identity_init(rows::Integer, cols::Integer; gain::Real=1, shift=0) = circshift(Matrix{Float32}(I * gain, rows,cols), shift) +function identity_init(rows::Integer, cols::Integer; gain::Real = 1, shift = 0) + return circshift(Matrix{Float32}(I * gain, rows, cols), shift) +end # Assume convolution -function identity_init(dims::Integer...; gain::Real=1, shift=0) - nin, nout = dims[end-1], dims[end] - centers = map(d -> cld(d, 2), dims[1:end-2]) - weights = zeros32(dims...) - for i in 1:min(nin,nout) - weights[centers..., i, i] = gain - end - return circshift(weights, shift) +function identity_init(dims::Integer...; gain::Real = 1, shift = 0) + nin, nout = dims[end - 1], dims[end] + centers = map(d -> cld(d, 2), dims[1:(end - 2)]) + weights = zeros32(dims...) + for i in 1:min(nin, nout) + weights[centers..., i, i] = gain + end + return circshift(weights, shift) end # For consistency, it accepts an RNG, but ignores it: -identity_init(::AbstractRNG, dims::Integer...; kwargs...) = identity_init(dims...; kwargs...) -identity_init(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (args...;kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...) +function identity_init(::AbstractRNG, dims::Integer...; kwargs...) + return identity_init(dims...; kwargs...) +end +function identity_init(rng::AbstractRNG = default_rng_value(); init_kwargs...) + return (args...; kwargs...) -> identity_init(rng, args...; init_kwargs..., kwargs...) +end ChainRulesCore.@non_differentiable identity_init(::Any...) @@ -509,20 +557,20 @@ randn32(rng::AbstractRNG) = (dims...,) -> Base.randn(rng, Float32, dims...) Return a bias parameter for a layer, based on the value given to the constructor's keyword `bias=bias`. -* `bias == true` creates a trainable array of the given size, of the same type as `weights`, initialised to zero. -* `bias == false` returns `false`, which is understood by AD to be non-differentiable. -* `bias::AbstractArray` uses the array provided, provided it has the correct size. - It does not at present correct the `eltype` to match that of `weights`. + - `bias == true` creates a trainable array of the given size, of the same type as `weights`, initialised to zero. + - `bias == false` returns `false`, which is understood by AD to be non-differentiable. + - `bias::AbstractArray` uses the array provided, provided it has the correct size. + It does not at present correct the `eltype` to match that of `weights`. """ function create_bias(weights::AbstractArray, bias::Bool, dims::Integer...) - bias ? fill!(similar(weights, dims...), 0) : false + return bias ? fill!(similar(weights, dims...), 0) : false end function create_bias(weights::AbstractArray, bias::AbstractArray, dims::Integer...) - size(bias) == dims || throw(DimensionMismatch("expected bias of size $(dims), got size $(size(bias))")) - bias + size(bias) == dims || + throw(DimensionMismatch("expected bias of size $(dims), got size $(size(bias))")) + return bias end - # Other """ @@ -537,10 +585,11 @@ execution on the leading edge, pass `leading=false`. To enable execution on the trailing edge, pass `trailing=true`. # Examples + ```jldoctest julia> a = Flux.throttle(() -> println("Flux"), 2); -julia> for i = 1:4 # a called in alternate iterations +julia> for i in 1:4 # a called in alternate iterations a() sleep(1) end @@ -548,39 +597,38 @@ Flux Flux ``` """ -function throttle(f, timeout; leading=true, trailing=false) - cooldown = true - later = nothing - result = nothing - - function throttled(args...; kwargs...) - yield() - - if cooldown - if leading - result = f(args...; kwargs...) - else - later = () -> f(args...; kwargs...) - end - - cooldown = false - @async try - while (sleep(timeout); later != nothing) - later() - later = nothing +function throttle(f, timeout; leading = true, trailing = false) + cooldown = true + later = nothing + result = nothing + + function throttled(args...; kwargs...) + yield() + + if cooldown + if leading + result = f(args...; kwargs...) + else + later = () -> f(args...; kwargs...) + end + + cooldown = false + @async try + while (sleep(timeout); later != nothing) + later() + later = nothing + end + finally + cooldown = true + end + elseif trailing + later = () -> (result = f(args...; kwargs...)) end - finally - cooldown = true - end - elseif trailing - later = () -> (result = f(args...; kwargs...)) - end - return result - end + return result + end end - """ modules(m) @@ -628,7 +676,7 @@ modules(m) = [x for x in Functors.fcollect(m) if !isleaflike(x)] @nograd modules # TODO: is this correct? might fail with explicit parameters. function ChainRulesCore.rrule(::typeof(modules), m) - modules(m), dm -> error("Flux.modules is not at present differentiable, sorry") + return modules(m), dm -> error("Flux.modules is not at present differentiable, sorry") end isleaflike(x) = Functors.isleaf(x) @@ -644,15 +692,15 @@ If the count is greater than or equal to `wait`, the function returns `true`, otherwise it returns `false`. # Examples + ```jldoctest julia> loss() = rand(); julia> trigger = Flux.patience(() -> loss() < 1, 3); - julia> for i in 1:10 - @info "Epoch \$i" - trigger() && break + @info "Epoch \$i" + trigger() && break end [ Info: Epoch 1 [ Info: Epoch 2 @@ -660,13 +708,13 @@ julia> for i in 1:10 ``` """ function patience(predicate, wait) - let count = 0 - function on_trigger(args...; kwargs...) - count = predicate(args...; kwargs...) ? count + 1 : 0 + let count = 0 + function on_trigger(args...; kwargs...) + count = predicate(args...; kwargs...) ? count + 1 : 0 - return count >= wait + return count >= wait + end end - end end """ @@ -680,17 +728,17 @@ the function returns `true`, otherwise it returns `false`. The count is reset when `distance(best_score, f(...)) > min_dist`. # Examples + ```jldoctest julia> loss = let l = 0 - () -> l += 1 + () -> l += 1 end; # pseudo loss function that returns increasing values julia> es = Flux.early_stopping(loss, 3); - julia> for i in 1:10 - @info "Epoch \$i" - es() && break + @info "Epoch \$i" + es() && break end [ Info: Epoch 1 [ Info: Epoch 2 @@ -698,17 +746,17 @@ julia> for i in 1:10 ``` """ function early_stopping(f, delay; distance = -, init_score = 0, min_dist = 0) - trigger = let best_score = init_score - (args...; kwargs...) -> begin - score = f(args...; kwargs...) - Δ = distance(best_score, score) - best_score = Δ < 0 ? best_score : score + trigger = let best_score = init_score + (args...; kwargs...) -> begin + score = f(args...; kwargs...) + Δ = distance(best_score, score) + best_score = Δ < 0 ? best_score : score - return Δ < min_dist + return Δ < min_dist + end end - end - return patience(trigger, delay) + return patience(trigger, delay) end """ @@ -722,17 +770,17 @@ the function returns `true`, otherwise it returns `false`. The count is reset when `abs(distance(last_score, f(...))) > min_dist`. # Examples + ```jldoctest julia> f = let v = 10 - () -> v = v / abs(v) - v + () -> v = v / abs(v) - v end; # -9, 8, -7, 6, ... -julia> trigger = Flux.plateau(f, 3; init_score=10, min_dist=18); - +julia> trigger = Flux.plateau(f, 3; init_score = 10, min_dist = 18); julia> for i in 1:10 - @info "Epoch \$i" - trigger() && break + @info "Epoch \$i" + trigger() && break end [ Info: Epoch 1 [ Info: Epoch 2 @@ -740,16 +788,16 @@ julia> for i in 1:10 [ Info: Epoch 4 ``` """ -function plateau(f, width; distance = -, init_score = 0, min_dist = 1f-6) - is_plateau = let last_score = init_score - (args...; kwargs...) -> begin - score = f(args...; kwargs...) - Δ = abs(distance(last_score, score)) - last_score = score +function plateau(f, width; distance = -, init_score = 0, min_dist = 1.0f-6) + is_plateau = let last_score = init_score + (args...; kwargs...) -> begin + score = f(args...; kwargs...) + Δ = abs(distance(last_score, score)) + last_score = score - return Δ < min_dist + return Δ < min_dist + end end - end - return patience(is_plateau, width) + return patience(is_plateau, width) end diff --git a/test/ctc-gpu.jl b/test/ctc-gpu.jl index d7ff1bdf9d..1ed898cd21 100644 --- a/test/ctc-gpu.jl +++ b/test/ctc-gpu.jl @@ -8,49 +8,52 @@ using CUDA # Custom function to check numerical gradient of ctc loss, # based on `ngradient` in `Tracker.jl` function ctc_ngradient(x, y) - f = Flux.Losses.ctc_loss - grads = zero(x) - for i in 1:length(x) - δ = sqrt(eps()) - tmp = x[i] - x[i] = tmp - δ/2 - y1 = f(x, y) - x[i] = tmp + δ/2 - y2 = f(x, y) - x[i] = tmp - grads[i] = (y2-y1)/δ - end - return grads + f = Flux.Losses.ctc_loss + grads = zero(x) + for i in 1:length(x) + δ = sqrt(eps()) + tmp = x[i] + x[i] = tmp - δ / 2 + y1 = f(x, y) + x[i] = tmp + δ / 2 + y2 = f(x, y) + x[i] = tmp + grads[i] = (y2 - y1) / δ + end + return grads end @testset "ctc-gpu" begin - x = rand(10, 50) - y = rand(1:9, 30) - x_cu = CuArray(x) - g1 = gradient(ctc_loss, x_cu, y)[1] - g1 = g1 |> collect - g2 = ctc_ngradient(x, y) - @test g1 ≈ g2 rtol=1e-5 atol=1e-5 - - # test that GPU loss matches CPU implementation - l1 = ctc_loss(x_cu, y) - l2 = ctc_loss(x, y) - @test l1 ≈ l2 - - # tests using hand-calculated values - x_cu = [1. 2. 3.; 2. 1. 1.; 3. 3. 2.] |> CuArray - y = [1, 2] - @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 - - g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; 0.0729422 0.447346 0.16457] - ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g ≈ ghat rtol=1e-5 atol=1e-5 + x = rand(10, 50) + y = rand(1:9, 30) + x_cu = CuArray(x) + g1 = gradient(ctc_loss, x_cu, y)[1] + g1 = g1 |> collect + g2 = ctc_ngradient(x, y) + @test g1≈g2 rtol=1e-5 atol=1e-5 - x_cu = [-3. 12. 8. 15.; 4. 20. -2. 20.; 8. -33. 6. 5.] |> CuArray - y = [1, 2] |> CuArray - @test ctc_loss(x_cu, y) ≈ 8.02519869363453 + # test that GPU loss matches CPU implementation + l1 = ctc_loss(x_cu, y) + l2 = ctc_loss(x, y) + @test l1 ≈ l2 - g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] - ghat = gradient(ctc_loss, x_cu, y)[1] |> collect - @test g ≈ ghat rtol=1e-5 atol=1e-5 + # tests using hand-calculated values + x_cu = [1.0 2.0 3.0; 2.0 1.0 1.0; 3.0 3.0 2.0] |> CuArray + y = [1, 2] + @test ctc_loss(x_cu, y) ≈ 3.6990738275138035 + + g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; + 0.0729422 0.447346 0.16457] + ghat = gradient(ctc_loss, x_cu, y)[1] |> collect + @test g≈ghat rtol=1e-5 atol=1e-5 + + x_cu = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] |> CuArray + y = [1, 2] |> CuArray + @test ctc_loss(x_cu, y) ≈ 8.02519869363453 + + g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] + ghat = gradient(ctc_loss, x_cu, y)[1] |> collect + @test g≈ghat rtol=1e-5 atol=1e-5 end diff --git a/test/ctc.jl b/test/ctc.jl index 6fa33c4b99..88386ff0e7 100644 --- a/test/ctc.jl +++ b/test/ctc.jl @@ -7,42 +7,45 @@ using LinearAlgebra # Custom function to check numerical gradient of ctc loss, # based on `ngradient` in `Tracker.jl` function ctc_ngradient(x, y) - f = Flux.Losses.ctc_loss - grads = zero(x) - for i in 1:length(x) - δ = sqrt(eps()) - tmp = x[i] - x[i] = tmp - δ/2 - y1 = f(x, y) - x[i] = tmp + δ/2 - y2 = f(x, y) - x[i] = tmp - grads[i] = (y2-y1)/δ - end - return grads + f = Flux.Losses.ctc_loss + grads = zero(x) + for i in 1:length(x) + δ = sqrt(eps()) + tmp = x[i] + x[i] = tmp - δ / 2 + y1 = f(x, y) + x[i] = tmp + δ / 2 + y2 = f(x, y) + x[i] = tmp + grads[i] = (y2 - y1) / δ + end + return grads end @testset "ctc_loss" begin - x = rand(10, 50) - y = rand(1:9, 30) - g1 = gradient(ctc_loss, x, y)[1] - g2 = ctc_ngradient(x, y) - @test g1 ≈ g2 rtol=1e-5 atol=1e-5 - - # tests using hand-calculated values - x = [1. 2. 3.; 2. 1. 1.; 3. 3. 2.] - y = [1, 2] - @test ctc_loss(x, y) ≈ 3.6990738275138035 + x = rand(10, 50) + y = rand(1:9, 30) + g1 = gradient(ctc_loss, x, y)[1] + g2 = ctc_ngradient(x, y) + @test g1≈g2 rtol=1e-5 atol=1e-5 - g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; 0.0729422 0.447346 0.16457] - ghat = gradient(ctc_loss, x, y)[1] - @test g ≈ ghat rtol=1e-5 atol=1e-5 + # tests using hand-calculated values + x = [1.0 2.0 3.0; 2.0 1.0 1.0; 3.0 3.0 2.0] + y = [1, 2] + @test ctc_loss(x, y) ≈ 3.6990738275138035 - x = [-3. 12. 8. 15.; 4. 20. -2. 20.; 8. -33. 6. 5.] - y = [1, 2] - @test ctc_loss(x, y) ≈ 8.02519869363453 + g = [-0.317671 -0.427729 0.665241; 0.244728 -0.0196172 -0.829811; + 0.0729422 0.447346 0.16457] + ghat = gradient(ctc_loss, x, y)[1] + @test g≈ghat rtol=1e-5 atol=1e-5 - g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] - ghat = gradient(ctc_loss, x, y)[1] - @test g ≈ ghat rtol=1e-5 atol=1e-5 + x = [-3.0 12.0 8.0 15.0; 4.0 20.0 -2.0 20.0; 8.0 -33.0 6.0 5.0] + y = [1, 2] + @test ctc_loss(x, y) ≈ 8.02519869363453 + + g = [-2.29294774655333e-06 -0.999662657278862 1.75500863563993e-06 0.00669284889063; + 0.017985914969696 0.999662657278861 -1.9907078755387e-06 -0.006693150917307; + -0.01798362202195 -2.52019580677916e-20 2.35699239251042e-07 3.02026677058789e-07] + ghat = gradient(ctc_loss, x, y)[1] + @test g≈ghat rtol=1e-5 atol=1e-5 end diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl index 2b4fec6e4c..c20236bec6 100644 --- a/test/cuda/cuda.jl +++ b/test/cuda/cuda.jl @@ -6,163 +6,163 @@ using LinearAlgebra: I, cholesky, Cholesky using SparseArrays: sparse, SparseMatrixCSC, AbstractSparseArray @testset "CUDA" begin - x = randn(5, 5) - cx = gpu(x) - @test cx isa CuArray + x = randn(5, 5) + cx = gpu(x) + @test cx isa CuArray - @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 + @test Flux.onecold(gpu([1.0, 2.0, 3.0])) == 3 - x = Flux.onehotbatch([1, 2, 3], 1:3) - cx = gpu(x) - @test cx isa Flux.OneHotMatrix && cx.indices isa CuArray - @test (cx .+ 1) isa CuArray + x = Flux.onehotbatch([1, 2, 3], 1:3) + cx = gpu(x) + @test cx isa Flux.OneHotMatrix && cx.indices isa CuArray + @test (cx .+ 1) isa CuArray - m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) - cm = gpu(m) + m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax) + cm = gpu(m) - @test all(p isa CuArray for p in params(cm)) - @test cm(gpu(rand(10, 10))) isa CuArray{Float32,2} + @test all(p isa CuArray for p in params(cm)) + @test cm(gpu(rand(10, 10))) isa CuArray{Float32, 2} - xs = rand(5, 5) - ys = Flux.onehotbatch(1:5,1:5) - @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) + xs = rand(5, 5) + ys = Flux.onehotbatch(1:5, 1:5) + @test collect(cu(xs) .+ cu(ys)) ≈ collect(xs .+ ys) - c = gpu(Conv((2,2),3=>4)) - x = gpu(rand(10, 10, 3, 2)) - l = c(gpu(rand(10,10,3,2))) - @test gradient(x -> sum(c(x)), x)[1] isa CuArray - - c = gpu(CrossCor((2,2),3=>4)) - x = gpu(rand(10, 10, 3, 2)) - l = c(gpu(rand(10,10,3,2))) - @test gradient(x -> sum(c(x)), x)[1] isa CuArray + c = gpu(Conv((2, 2), 3 => 4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10, 10, 3, 2))) + @test gradient(x -> sum(c(x)), x)[1] isa CuArray + c = gpu(CrossCor((2, 2), 3 => 4)) + x = gpu(rand(10, 10, 3, 2)) + l = c(gpu(rand(10, 10, 3, 2))) + @test gradient(x -> sum(c(x)), x)[1] isa CuArray end @testset "onehot gpu" begin - y = Flux.onehotbatch(ones(3), 1:2) |> gpu; - @test (repr("text/plain", y); true) - - gA = rand(3, 2) |> gpu; - @test gradient(A -> sum(A * y), gA)[1] isa CuArray - - # construct from CuArray - x = [1, 3, 2] - y = Flux.onehotbatch(x, 0:3) - @test_skip begin # https://github.com/FluxML/OneHotArrays.jl/issues/16 - y2 = Flux.onehotbatch(x |> gpu, 0:3) - @test y2.indices isa CuArray - @test y2 |> cpu == y - end + y = Flux.onehotbatch(ones(3), 1:2) |> gpu + @test (repr("text/plain", y); true) + + gA = rand(3, 2) |> gpu + @test gradient(A -> sum(A * y), gA)[1] isa CuArray + + # construct from CuArray + x = [1, 3, 2] + y = Flux.onehotbatch(x, 0:3) + @test_skip begin # https://github.com/FluxML/OneHotArrays.jl/issues/16 + y2 = Flux.onehotbatch(x |> gpu, 0:3) + @test y2.indices isa CuArray + @test y2 |> cpu == y + end end @testset "onecold gpu" begin - y = Flux.onehotbatch(ones(3), 1:10) |> gpu; - l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] - @test Flux.onecold(y) isa CuArray - @test y[3,:] isa CuArray - @test Flux.onecold(y, l) == ['a', 'a', 'a'] + y = Flux.onehotbatch(ones(3), 1:10) |> gpu + l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + @test Flux.onecold(y) isa CuArray + @test y[3, :] isa CuArray + @test Flux.onecold(y, l) == ['a', 'a', 'a'] end @testset "onehot forward map to broadcast" begin - oa = OneHotArray(rand(1:10, 5, 5), 10) |> gpu - @test all(map(identity, oa) .== oa) - @test all(map(x -> 2 * x, oa) .== 2 .* oa) + oa = OneHotArray(rand(1:10, 5, 5), 10) |> gpu + @test all(map(identity, oa) .== oa) + @test all(map(x -> 2 * x, oa) .== 2 .* oa) end @testset "restructure gpu" begin - dudt = Dense(1,1) |> gpu - p,re = Flux.destructure(dudt) - foo(x) = sum(re(p)(x)) - @test gradient(foo, cu(rand(1)))[1] isa CuArray + dudt = Dense(1, 1) |> gpu + p, re = Flux.destructure(dudt) + foo(x) = sum(re(p)(x)) + @test gradient(foo, cu(rand(1)))[1] isa CuArray end @testset "GPU functors" begin - @testset "Cholesky" begin - M = 2.0*I(10) |> collect - Q = cholesky(M) - Q_gpu = Q |> gpu - @test Q_gpu isa Cholesky{<:Any,<:CuArray} - Q_cpu = Q_gpu |> cpu - @test Q_cpu == cholesky(eltype(Q_gpu).(M)) - end - - @testset "isbits array types" begin - struct SimpleBits - field::Int32 + @testset "Cholesky" begin + M = 2.0 * I(10) |> collect + Q = cholesky(M) + Q_gpu = Q |> gpu + @test Q_gpu isa Cholesky{<:Any, <:CuArray} + Q_cpu = Q_gpu |> cpu + @test Q_cpu == cholesky(eltype(Q_gpu).(M)) + end + + @testset "isbits array types" begin + struct SimpleBits + field::Int32 + end + + @test gpu((; a = ones(1))).a isa CuVector{Float32} + @test gpu((; a = ['a', 'b', 'c'])).a isa CuVector{Char} + @test gpu((; a = [SimpleBits(1)])).a isa CuVector{SimpleBits} end - - @test gpu((;a=ones(1))).a isa CuVector{Float32} - @test gpu((;a=['a', 'b', 'c'])).a isa CuVector{Char} - @test gpu((;a=[SimpleBits(1)])).a isa CuVector{SimpleBits} - end end @testset "gpu(cpu(x)) inside gradient" begin - a = randn(Float32, 4, 4) - ca = cu(a) - - # Trivial functions - @test gradient(x -> sum(abs, gpu(x)), a)[1] isa Matrix - @test gradient(x -> sum(gpu(x)), a)[1] isa Matrix - @test_skip gradient(x -> sum(gpu(x)), a')[1] isa Matrix # sum(::Adjoint{T,CuArray}) makes a Fill - @test gradient(x -> sum(abs, cpu(x)), ca)[1] isa CuArray - # This test should really not go through indirections and pull out Fills for efficiency - # but we forcefully materialise. TODO: remove materialising CuArray here - @test gradient(x -> sum(cpu(x)), ca)[1] isa CuArray # This involves FillArray, which should be GPU compatible - @test gradient(x -> sum(cpu(x)), ca')[1] isa LinearAlgebra.Adjoint - - # Even more trivial: no movement - @test gradient(x -> sum(abs, cpu(x)), a)[1] isa Matrix - @test gradient(x -> sum(abs, cpu(x)), a')[1] isa Matrix - @test gradient(x -> sum(cpu(x)), a)[1] isa typeof(gradient(sum, a)[1]) # FillArray - @test gradient(x -> sum(abs, gpu(x)), ca)[1] isa CuArray - @test_skip gradient(x -> sum(abs, gpu(x)), ca')[1] isa CuArray # KernelError: passing and using non-bitstype argument - - # More complicated, Array * CuArray is an error - g0 = gradient(x -> sum(abs, (a * (a * x))), a)[1] - @test g0 ≈ gradient(x -> sum(abs, cpu(ca * gpu(a * x))), a)[1] - @test cu(g0) ≈ gradient(x -> sum(abs, gpu(a * cpu(ca * x))), ca)[1] - @test gradient(x -> sum(gpu(cpu(x))), a)[1] isa Matrix - @test gradient(x -> sum(gpu(cpu(x))), ca)[1] isa CuArray - - g4 = gradient(x -> sum(a * (a' * x)), a)[1] # no abs, one adjoint - @test g4 ≈ gradient(x -> sum(cpu(ca * gpu(a' * x))), a)[1] - @test cu(g4) ≈ gradient(x -> sum(gpu(a * cpu(ca' * x))), ca)[1] - - # Scalar indexing of an array, needs OneElement to transfer to GPU - # https://github.com/FluxML/Zygote.jl/issues/1005 - @test gradient(x -> cpu(2 .* gpu(x))[1], Float32[1,2,3]) == ([2,0,0],) - @test gradient(x -> cpu(gpu(x) * gpu(x))[1,2], Float32[1 2 3; 4 5 6; 7 8 9]) == ([2 6 8; 0 2 0; 0 3 0],) + a = randn(Float32, 4, 4) + ca = cu(a) + + # Trivial functions + @test gradient(x -> sum(abs, gpu(x)), a)[1] isa Matrix + @test gradient(x -> sum(gpu(x)), a)[1] isa Matrix + @test_skip gradient(x -> sum(gpu(x)), a')[1] isa Matrix # sum(::Adjoint{T,CuArray}) makes a Fill + @test gradient(x -> sum(abs, cpu(x)), ca)[1] isa CuArray + # This test should really not go through indirections and pull out Fills for efficiency + # but we forcefully materialise. TODO: remove materialising CuArray here + @test gradient(x -> sum(cpu(x)), ca)[1] isa CuArray # This involves FillArray, which should be GPU compatible + @test gradient(x -> sum(cpu(x)), ca')[1] isa LinearAlgebra.Adjoint + + # Even more trivial: no movement + @test gradient(x -> sum(abs, cpu(x)), a)[1] isa Matrix + @test gradient(x -> sum(abs, cpu(x)), a')[1] isa Matrix + @test gradient(x -> sum(cpu(x)), a)[1] isa typeof(gradient(sum, a)[1]) # FillArray + @test gradient(x -> sum(abs, gpu(x)), ca)[1] isa CuArray + @test_skip gradient(x -> sum(abs, gpu(x)), ca')[1] isa CuArray # KernelError: passing and using non-bitstype argument + + # More complicated, Array * CuArray is an error + g0 = gradient(x -> sum(abs, (a * (a * x))), a)[1] + @test g0 ≈ gradient(x -> sum(abs, cpu(ca * gpu(a * x))), a)[1] + @test cu(g0) ≈ gradient(x -> sum(abs, gpu(a * cpu(ca * x))), ca)[1] + @test gradient(x -> sum(gpu(cpu(x))), a)[1] isa Matrix + @test gradient(x -> sum(gpu(cpu(x))), ca)[1] isa CuArray + + g4 = gradient(x -> sum(a * (a' * x)), a)[1] # no abs, one adjoint + @test g4 ≈ gradient(x -> sum(cpu(ca * gpu(a' * x))), a)[1] + @test cu(g4) ≈ gradient(x -> sum(gpu(a * cpu(ca' * x))), ca)[1] + + # Scalar indexing of an array, needs OneElement to transfer to GPU + # https://github.com/FluxML/Zygote.jl/issues/1005 + @test gradient(x -> cpu(2 .* gpu(x))[1], Float32[1, 2, 3]) == ([2, 0, 0],) + @test gradient(x -> cpu(gpu(x) * gpu(x))[1, 2], Float32[1 2 3; 4 5 6; 7 8 9]) == + ([2 6 8; 0 2 0; 0 3 0],) end @testset "gpu(x) and cpu(x) on structured arrays" begin - @test cpu(1:3) isa UnitRange - @test cpu(range(1, 3, length = 4)) isa AbstractRange - - # OneElement isn't GPU compatible - g1 = Zygote.OneElement(1, (2,3), axes(ones(4,5))) - @test cpu(g1) isa Zygote.OneElement - - g2 = Zygote.Fill(1f0, 2) - @test cpu(g2) isa Zygote.FillArrays.AbstractFill - - g3 = transpose(Float32[1 2; 3 4]) - @test parent(cpu(g3)) isa Matrix{Float32} - - @testset "Sparse Arrays" begin - @test cpu(sparse(rand(3,3))) isa SparseMatrixCSC - a = sparse(rand(3,3)) - @test cpu(a) === a - @test gpu(sparse(rand(3,3))) isa CUDA.CUSPARSE.CuSparseMatrixCSC - end - - # Check that gpu() converts these to CuArrays. This a side-effect of using the same functions - # in gpu() as in the gradient of cpu(). A different design could avoid having gpu() used alone - # move these, if that turns out to be desirable. - @test gpu(g1) isa CuArray - @test gpu(g1) ≈ cu(Matrix(g1)) - @test gpu(g2) isa CuArray - @test gpu(g2) ≈ cu(Vector(g2)) - @test parent(gpu(g3)) isa CuArray + @test cpu(1:3) isa UnitRange + @test cpu(range(1, 3, length = 4)) isa AbstractRange + + # OneElement isn't GPU compatible + g1 = Zygote.OneElement(1, (2, 3), axes(ones(4, 5))) + @test cpu(g1) isa Zygote.OneElement + + g2 = Zygote.Fill(1.0f0, 2) + @test cpu(g2) isa Zygote.FillArrays.AbstractFill + + g3 = transpose(Float32[1 2; 3 4]) + @test parent(cpu(g3)) isa Matrix{Float32} + + @testset "Sparse Arrays" begin + @test cpu(sparse(rand(3, 3))) isa SparseMatrixCSC + a = sparse(rand(3, 3)) + @test cpu(a) === a + @test gpu(sparse(rand(3, 3))) isa CUDA.CUSPARSE.CuSparseMatrixCSC + end + + # Check that gpu() converts these to CuArrays. This a side-effect of using the same functions + # in gpu() as in the gradient of cpu(). A different design could avoid having gpu() used alone + # move these, if that turns out to be desirable. + @test gpu(g1) isa CuArray + @test gpu(g1) ≈ cu(Matrix(g1)) + @test gpu(g2) isa CuArray + @test gpu(g2) ≈ cu(Vector(g2)) + @test parent(gpu(g3)) isa CuArray end diff --git a/test/cuda/curnn.jl b/test/cuda/curnn.jl index 5c460d2aa4..2ff137d34f 100644 --- a/test/cuda/curnn.jl +++ b/test/cuda/curnn.jl @@ -1,26 +1,25 @@ using Flux, CUDA, Test @testset for R in [RNN, GRU, LSTM, GRUv3] - m = R(10, 5) |> gpu - x = gpu(rand(10)) - (m̄,) = gradient(m -> sum(m(x)), m) - Flux.reset!(m) - θ = gradient(() -> sum(m(x)), params(m)) - @test x isa CuArray - @test θ[m.cell.Wi] isa CuArray - @test collect(m̄.cell.Wi) == collect(θ[m.cell.Wi]) + m = R(10, 5) |> gpu + x = gpu(rand(10)) + (m̄,) = gradient(m -> sum(m(x)), m) + Flux.reset!(m) + θ = gradient(() -> sum(m(x)), params(m)) + @test x isa CuArray + @test θ[m.cell.Wi] isa CuArray + @test collect(m̄.cell.Wi) == collect(θ[m.cell.Wi]) end -@testset "RNN" begin - @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) +@testset "RNN" begin @testset for R in [RNN, GRU, LSTM, GRUv3], batch_size in (1, 5) rnn = R(10, 5) curnn = fmap(gpu, rnn) Flux.reset!(rnn) Flux.reset!(curnn) x = batch_size == 1 ? - rand(Float32, 10) : - rand(Float32, 10, batch_size) + rand(Float32, 10) : + rand(Float32, 10, batch_size) cux = gpu(x) y, back = pullback((r, x) -> r(x), rnn, x) @@ -37,18 +36,18 @@ end @test m̄[].cell.Wh ≈ collect(cum̄[].cell.Wh) @test m̄[].cell.b ≈ collect(cum̄[].cell.b) if m̄[].state isa Tuple - for (x, cx) in zip(m̄[].state, cum̄[].state) - @test x ≈ collect(cx) - end + for (x, cx) in zip(m̄[].state, cum̄[].state) + @test x ≈ collect(cx) + end else - @test m̄[].state ≈ collect(cum̄[].state) + @test m̄[].state ≈ collect(cum̄[].state) end Flux.reset!(rnn) Flux.reset!(curnn) ohx = batch_size == 1 ? - Flux.onehot(rand(1:10), 1:10) : - Flux.onehotbatch(rand(1:10, batch_size), 1:10) + Flux.onehot(rand(1:10), 1:10) : + Flux.onehotbatch(rand(1:10, batch_size), 1:10) cuohx = gpu(ohx) y = (rnn(ohx); rnn(ohx)) @@ -63,5 +62,4 @@ end cufy = (curnn(cufx); curnn(cufx)) @test fy ≈ collect(cufy) - end -end +end end diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl index 8024681a06..ea4452a47e 100644 --- a/test/cuda/layers.jl +++ b/test/cuda/layers.jl @@ -6,71 +6,70 @@ # generic movement tests @testset "Basic GPU Movement" begin - @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple - @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple + @test gradient(x -> sum(gpu(x)), rand(3, 3)) isa Tuple + @test gradient(x -> sum(cpu(x)), gpu(rand(3, 3))) isa Tuple end # TODO: These layers get into scalar indexing issues. const BROKEN_LAYERS = Union{} const ACTIVATIONS = [identity, relu, tanh, - sigmoid, exp, softplus, - elu, selu] - -function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; test_cpu = true) - isnothing(x_cpu) && error("Missing input to test the layers against.") - @testset "$name GPU grad tests" begin - for layer in layers - @testset "$layer Layer GPU grad test" begin - - # compute output and grad of parameters - l_cpu = layer(args...) - ps_cpu = Flux.params(l_cpu) - y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) - gs_cpu = back_cpu(1f0) - - x_gpu = gpu(x_cpu) - l_gpu = l_cpu |> gpu - ps_gpu = Flux.params(l_gpu) - - if typeof(l_gpu) <: BROKEN_LAYERS - @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads - else - y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) - gs_gpu = back_gpu(1f0) # TODO many layers error out when backprop int 1, should fix - - # compute grad of input - xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] - xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] - - # test - if test_cpu - @test y_gpu ≈ y_cpu rtol=1f-3 atol=1f-3 - if isnothing(xg_cpu) - @test isnothing(xg_gpu) + sigmoid, exp, softplus, + elu, selu] + +function gpu_gradtest(name::String, layers::Vector, x_cpu = nothing, args...; + test_cpu = true) + isnothing(x_cpu) && error("Missing input to test the layers against.") + @testset "$name GPU grad tests" begin for layer in layers + @testset "$layer Layer GPU grad test" begin + + # compute output and grad of parameters + l_cpu = layer(args...) + ps_cpu = Flux.params(l_cpu) + y_cpu, back_cpu = pullback(() -> sum(l_cpu(x_cpu)), ps_cpu) + gs_cpu = back_cpu(1.0f0) + + x_gpu = gpu(x_cpu) + l_gpu = l_cpu |> gpu + ps_gpu = Flux.params(l_gpu) + + if typeof(l_gpu) <: BROKEN_LAYERS + @test_broken gradient(() -> sum(l_gpu(x_gpu)), ps_gpu) isa Flux.Zygote.Grads else - if layer === GroupedConvTranspose - @test Array(xg_gpu) ≈ xg_cpu rtol = 2f-2 atol = 1f-3 - else - @test Array(xg_gpu) ≈ xg_cpu rtol = 1f-3 atol = 1f-3 - end + y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu) + gs_gpu = back_gpu(1.0f0) # TODO many layers error out when backprop int 1, should fix + + # compute grad of input + xg_cpu = gradient(x -> sum(l_cpu(x)), x_cpu)[1] + xg_gpu = gradient(x -> sum(l_gpu(x)), x_gpu)[1] + + # test + if test_cpu + @test y_gpu≈y_cpu rtol=1.0f-3 atol=1.0f-3 + if isnothing(xg_cpu) + @test isnothing(xg_gpu) + else + if layer === GroupedConvTranspose + @test Array(xg_gpu)≈xg_cpu rtol=2.0f-2 atol=1.0f-3 + else + @test Array(xg_gpu)≈xg_cpu rtol=1.0f-3 atol=1.0f-3 + end + end + end + @test gs_gpu isa Flux.Zygote.Grads + for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) + if isnothing(gs_cpu[p_cpu]) + @test isnothing(gs_gpu[p_gpu]) + else + @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray + if test_cpu + @test Array(gs_gpu[p_gpu])≈gs_cpu[p_cpu] rtol=1.0f-3 atol=1.0f-3 + end + end + end end - end - @test gs_gpu isa Flux.Zygote.Grads - for (p_cpu, p_gpu) in zip(ps_cpu, ps_gpu) - if isnothing(gs_cpu[p_cpu]) - @test isnothing(gs_gpu[p_gpu]) - else - @test gs_gpu[p_gpu] isa Flux.CUDA.CuArray - if test_cpu - @test Array(gs_gpu[p_gpu]) ≈ gs_cpu[p_cpu] rtol=1f-3 atol=1f-3 - end - end - end end - end - end - end + end end end # Just to give testset in gpu_gradtest meaningful labels @@ -82,44 +81,49 @@ GroupedConv(args...) = Conv(args..., groups = 5) GroupedConvTranspose(args...) = ConvTranspose(args..., groups = 5) for act in ACTIVATIONS - r = rand(Float32, 28, 28, 1, 1) - conv_layers = [Conv, ConvNoBias, - ConvTranspose, ConvTransposeNoBias, - CrossCor, CrossCorNoBias, - DepthwiseConv, DepthwiseConvNoBias] - gpu_gradtest("Convolution with $act", conv_layers, r, (2,2), 1=>3, act, test_cpu = false) - - groupedconv = [GroupedConv, GroupedConvTranspose] - gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), (3,3), 100 => 25, act, test_cpu = true) - - batch_norm = [BatchNorm] - gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28,28,3,4), 3, act, test_cpu = false) #TODO fix errors - gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5,4), 5, act, test_cpu = false) - - instancenorm = [InstanceNorm] - gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) - - groupnorm = [GroupNorm] - gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28,28,3,1), 3, 1, act, test_cpu = false) + r = rand(Float32, 28, 28, 1, 1) + conv_layers = [Conv, ConvNoBias, + ConvTranspose, ConvTransposeNoBias, + CrossCor, CrossCorNoBias, + DepthwiseConv, DepthwiseConvNoBias] + gpu_gradtest("Convolution with $act", conv_layers, r, (2, 2), 1 => 3, act, + test_cpu = false) + + groupedconv = [GroupedConv, GroupedConvTranspose] + gpu_gradtest("GroupedConvolution with $act", groupedconv, rand(Float32, 28, 28, 100, 2), + (3, 3), 100 => 25, act, test_cpu = true) + + batch_norm = [BatchNorm] + gpu_gradtest("BatchNorm 1 with $act", batch_norm, rand(Float32, 28, 28, 3, 4), 3, act, + test_cpu = false) #TODO fix errors + gpu_gradtest("BatchNorm 2 with $act", batch_norm, rand(Float32, 5, 4), 5, act, + test_cpu = false) + + instancenorm = [InstanceNorm] + gpu_gradtest("InstanceNorm with $act", instancenorm, r, 1, act, test_cpu = false) + + groupnorm = [GroupNorm] + gpu_gradtest("GroupNorm with $act", groupnorm, rand(Float32, 28, 28, 3, 1), 3, 1, act, + test_cpu = false) end r = rand(Float32, 28, 28, 1, 1) pooling_layers = [MaxPool, MeanPool] -gpu_gradtest("Pooling", pooling_layers, r, (2,2)) +gpu_gradtest("Pooling", pooling_layers, r, (2, 2)) adaptive_pooling_layers = [AdaptiveMaxPool, AdaptiveMeanPool] -gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7,7), test_cpu = false) +gpu_gradtest("AdaptivePooling", adaptive_pooling_layers, r, (7, 7), test_cpu = false) dropout_layers = [Dropout, AlphaDropout] gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic layer_norm = [LayerNorm] -gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 1, test_cpu = false) #TODO fix errors -gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5) +gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28, 28, 3, 4), 1, test_cpu = false) #TODO fix errors +gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5, 4), 5) -upsample = [x -> Upsample(scale=x)] -gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2)) +upsample = [x -> Upsample(scale = x)] +gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2, 2)) gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,)) pixelshuffle = [PixelShuffle] @@ -127,166 +131,168 @@ gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3) gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3) embedding = [Flux.Embedding] -gpu_gradtest("Embedding", embedding, [1,3,5], 5, 2) -gpu_gradtest("Embedding repeated indices", embedding, [1,3,5,3], 5, 2) +gpu_gradtest("Embedding", embedding, [1, 3, 5], 5, 2) +gpu_gradtest("Embedding repeated indices", embedding, [1, 3, 5, 3], 5, 2) gpu_gradtest("Embedding integer index", embedding, 1, 5, 2) gpu_gradtest("Embedding 2d index", embedding, [1 2; 3 4], 5, 2) gpu_gradtest("Embedding OneHotVec index", embedding, OneHotVector(1, 5), 5, 2) -gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1,2,3], 5), 5, 2) -gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, OneHotMatrix([1,2,2], 5), 5, 2) +gpu_gradtest("Embedding OneHotMatrix index", embedding, OneHotMatrix([1, 2, 3], 5), 5, 2) +gpu_gradtest("Embedding OneHotMatrix repeated indices", embedding, + OneHotMatrix([1, 2, 2], 5), 5, 2) @testset "function layers" begin - x = rand(Float32, 3,3) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x) - gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) + x = rand(Float32, 3, 3) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims = 1)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x; dims = 2)), x) + gpu_autodiff_test(x -> sum(Flux.normalise(x)), x) end @testset "Zeros mapped for $cl" for cl in (Conv, ConvTranspose, CrossCor, DepthwiseConv) - l = cl((2,2), 1=>3, bias = false) |> gpu - ip = zeros(Float32, 28,28,1,1) |> gpu - if typeof(l) <: BROKEN_LAYERS - @test_broken sum(l(ip)) ≈ 0.f0 - @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads - else - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params - end + l = cl((2, 2), 1 => 3, bias = false) |> gpu + ip = zeros(Float32, 28, 28, 1, 1) |> gpu + if typeof(l) <: BROKEN_LAYERS + @test_broken sum(l(ip)) ≈ 0.0f0 + @test_broken gradient(() -> sum(l(ip)), Flux.params(l)) isa Flux.Zygote.Grads + else + @test sum(l(ip)) ≈ 0.0f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params + end end @testset "Dense without bias" begin - l = Dense(ones(Float32, 4, 3), false) |> gpu - ip = zeros(Float32, 3, 7) |> gpu + l = Dense(ones(Float32, 4, 3), false) |> gpu + ip = zeros(Float32, 3, 7) |> gpu - @test sum(l(ip)) ≈ 0.f0 - gs = gradient(() -> sum(l(ip)), Flux.params(l)) - @test l.bias ∉ gs.params + @test sum(l(ip)) ≈ 0.0f0 + gs = gradient(() -> sum(l(ip)), Flux.params(l)) + @test l.bias ∉ gs.params end @testset "Extended BatchNorm" begin - m_cpu = BatchNorm(2) - m_gpu = m_cpu |> gpu - x_cpu = rand(Float32, 3, 2, 2) - x_gpu = x_cpu |> gpu - - ## In :auto mode, track statistics only in gradient contest - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test !(m_cpu.μ ≈ μ_cpu) - - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test !(m_gpu.μ ≈ μ_gpu) - - @test Array(m_gpu.μ) ≈ m_cpu.μ - - ## In testmode, never track statistics - testmode!(m_cpu) - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test m_cpu.μ ≈ μ_cpu - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test m_cpu.μ ≈ μ_cpu - - testmode!(m_gpu) - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test m_gpu.μ ≈ μ_gpu - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test m_gpu.μ ≈ μ_gpu - - ## In trainmode, always track statistics - trainmode!(m_cpu) - μ_cpu = copy(m_cpu.μ) - m_cpu(x_cpu) - @test !(m_cpu.μ ≈ μ_cpu) - μ_cpu = copy(m_cpu.μ) - gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - @test !(m_cpu.μ ≈ μ_cpu) - - trainmode!(m_gpu) - μ_gpu = copy(m_gpu.μ) - m_gpu(x_gpu) - @test !(m_gpu.μ ≈ μ_gpu) - μ_gpu = copy(m_gpu.μ) - gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) - @test !(m_gpu.μ ≈ μ_gpu) - - ## No errors if input type mistmatch - # x_cpu = rand(Float64, 3, 2, 2) - # x_gpu = x_cpu |> gpu - # m_cpu(x_cpu) - # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) - # m_gpu(x_gpu) - # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + m_cpu = BatchNorm(2) + m_gpu = m_cpu |> gpu + x_cpu = rand(Float32, 3, 2, 2) + x_gpu = x_cpu |> gpu + + ## In :auto mode, track statistics only in gradient contest + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + @test Array(m_gpu.μ) ≈ m_cpu.μ + + ## In testmode, never track statistics + testmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test m_cpu.μ ≈ μ_cpu + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test m_cpu.μ ≈ μ_cpu + + testmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test m_gpu.μ ≈ μ_gpu + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test m_gpu.μ ≈ μ_gpu + + ## In trainmode, always track statistics + trainmode!(m_cpu) + μ_cpu = copy(m_cpu.μ) + m_cpu(x_cpu) + @test !(m_cpu.μ ≈ μ_cpu) + μ_cpu = copy(m_cpu.μ) + gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + @test !(m_cpu.μ ≈ μ_cpu) + + trainmode!(m_gpu) + μ_gpu = copy(m_gpu.μ) + m_gpu(x_gpu) + @test !(m_gpu.μ ≈ μ_gpu) + μ_gpu = copy(m_gpu.μ) + gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) + @test !(m_gpu.μ ≈ μ_gpu) + + ## No errors if input type mistmatch + # x_cpu = rand(Float64, 3, 2, 2) + # x_gpu = x_cpu |> gpu + # m_cpu(x_cpu) + # gradient(() -> sum(m_cpu(x_cpu)), Flux.params(m_cpu)) + # m_gpu(x_gpu) + # gradient(() -> sum(m_gpu(x_gpu)), Flux.params(m_gpu)) end @testset "Two-streams Bilinear" begin - x = zeros(Float32,10,9) |> gpu - y = zeros(Float32,2,9) |> gpu - b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end + x = zeros(Float32, 10, 9) |> gpu + y = zeros(Float32, 2, 9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x, y)) == (3, 9) + @test sum(abs2, b(x, y)) ≈ 0.0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end end @testset "Two-streams Bilinear" begin - x = zeros(Float32,10,9) |> gpu - y = zeros(Float32,2,9) |> gpu - b = Flux.Bilinear(10, 2, 3) |> gpu - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) ≈ 0f0 - gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) - b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu - gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) - for (pgpu, pcpu) in zip(params(b), params(b_cpu)) - @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) - end + x = zeros(Float32, 10, 9) |> gpu + y = zeros(Float32, 2, 9) |> gpu + b = Flux.Bilinear(10, 2, 3) |> gpu + @test size(b(x, y)) == (3, 9) + @test sum(abs2, b(x, y)) ≈ 0.0f0 + gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b)) + b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu + gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu)) + for (pgpu, pcpu) in zip(params(b), params(b_cpu)) + @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu]) + end end @testset "Parallel" begin - @testset "zero sum" begin - input = randn(10, 10, 10, 10) |> gpu - layer_gpu = Parallel(+, zero, identity) |> gpu - @test layer_gpu(input) == input - @test layer_gpu(input) isa Flux.CUDA.CuArray - end - - @testset "vararg input" begin - inputs = (randn(10), randn(5), randn(4)) .|> gpu - layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu - @test size(layer(inputs)) == (2,) - end - - @testset "gradient" begin - input_cpu = randn(10, 10, 10, 10) - input_gpu = input_cpu |> gpu - layer_cpu = Parallel(+, x -> zero(x), identity) - layer_gpu = layer_cpu |> gpu - gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) - gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) - for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) - @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] + @testset "zero sum" begin + input = randn(10, 10, 10, 10) |> gpu + layer_gpu = Parallel(+, zero, identity) |> gpu + @test layer_gpu(input) == input + @test layer_gpu(input) isa Flux.CUDA.CuArray + end + + @testset "vararg input" begin + inputs = (randn(10), randn(5), randn(4)) .|> gpu + layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu + @test size(layer(inputs)) == (2,) + end + + @testset "gradient" begin + input_cpu = randn(10, 10, 10, 10) + input_gpu = input_cpu |> gpu + layer_cpu = Parallel(+, x -> zero(x), identity) + layer_gpu = layer_cpu |> gpu + gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu)) + gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu)) + for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu)) + @test gs_cpu[pcpu] ≈ gs_gpu[pgpu] + end end - end end @testset "Dropout RNGs" begin - @test_throws ArgumentError Flux.dropout(MersenneTwister(), CUDA.rand(Float32, 2, 3), 0.1) - @testset for layer in (Dropout, AlphaDropout) - m = layer(0.1; rng = MersenneTwister(123)) - @test_throws ErrorException gpu(m) - m = layer(0.1; rng = CUDA.default_rng()) - @test gpu(m).rng isa CUDA.RNG - end + @test_throws ArgumentError Flux.dropout(MersenneTwister(), CUDA.rand(Float32, 2, 3), + 0.1) + @testset for layer in (Dropout, AlphaDropout) + m = layer(0.1; rng = MersenneTwister(123)) + @test_throws ErrorException gpu(m) + m = layer(0.1; rng = CUDA.default_rng()) + @test gpu(m).rng isa CUDA.RNG + end end diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl index a0f7f47d80..6777146b3d 100644 --- a/test/cuda/losses.jl +++ b/test/cuda/losses.jl @@ -1,38 +1,37 @@ -using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, binary_focal_loss, focal_loss - +using Flux.Losses: crossentropy, binarycrossentropy, logitbinarycrossentropy, + binary_focal_loss, focal_loss @testset "Losses" begin - -x = [1.,2.,3.] -cx = gpu(x) -@test crossentropy(x,x) ≈ crossentropy(cx,cx) -@test crossentropy(x,x, agg=identity) ≈ crossentropy(cx,cx, agg=identity) |> cpu -@test crossentropy(x,x, agg=x->mean([1.0;2.0;3.0].*x)) ≈ crossentropy(cx,cx, agg=x->mean(gpu([1.0;2.0;3.0]).*x)) - -x = [-1.1491, 0.8619, 0.3127] -y = [1, 1, 0.] -@test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) -@test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) - -x = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] -y = [0 1 0 - 1 0 1] -@test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) - -x = softmax(reshape(-7:7, 3, 5) .* 1f0) -y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] -@test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) - -@testset "GPU grad tests" begin - x = rand(Float32, 3,3) - y = rand(Float32, 3,3) - - for loss in ALL_LOSSES - gpu_autodiff_test(loss, x, y) - end -end - + x = [1.0, 2.0, 3.0] + cx = gpu(x) + @test crossentropy(x, x) ≈ crossentropy(cx, cx) + @test crossentropy(x, x, agg = identity) ≈ crossentropy(cx, cx, agg = identity) |> cpu + @test crossentropy(x, x, agg = x -> mean([1.0; 2.0; 3.0] .* x)) ≈ + crossentropy(cx, cx, agg = x -> mean(gpu([1.0; 2.0; 3.0]) .* x)) + + x = [-1.1491, 0.8619, 0.3127] + y = [1, 1, 0.0] + @test binarycrossentropy(σ.(x), y) ≈ binarycrossentropy(gpu(σ.(x)), gpu(y)) + @test logitbinarycrossentropy(x, y) ≈ logitbinarycrossentropy(gpu(x), gpu(y)) + + x = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] + y = [0 1 0 + 1 0 1] + @test binary_focal_loss(x, y) ≈ binary_focal_loss(gpu(x), gpu(y)) + + x = softmax(reshape(-7:7, 3, 5) .* 1.0f0) + y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] + @test focal_loss(x, y) ≈ focal_loss(gpu(x), gpu(y)) + + @testset "GPU grad tests" begin + x = rand(Float32, 3, 3) + y = rand(Float32, 3, 3) + + for loss in ALL_LOSSES + gpu_autodiff_test(loss, x, y) + end + end end #testset diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl index ebd32b1ec0..ca44286e48 100644 --- a/test/cuda/runtests.jl +++ b/test/cuda/runtests.jl @@ -12,9 +12,9 @@ include("losses.jl") include("layers.jl") if CUDA.has_cudnn() - @info "Testing Flux/CUDNN" - include("cudnn.jl") - include("curnn.jl") + @info "Testing Flux/CUDNN" + include("cudnn.jl") + include("curnn.jl") else - @warn "CUDNN unavailable, not testing GPU DNN support" + @warn "CUDNN unavailable, not testing GPU DNN support" end diff --git a/test/cuda/test_utils.jl b/test/cuda/test_utils.jl index bc0db37474..466b08c8b9 100644 --- a/test/cuda/test_utils.jl +++ b/test/cuda/test_utils.jl @@ -1,72 +1,75 @@ function check_grad(g_gpu, g_cpu, atol, rtol) - @show g_gpu g_cpu - @test false + @show g_gpu g_cpu + @test false +end +function check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) + return check_grad(g_gpu[], g_cpu[], atol, rtol) end -check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, rtol) = - check_grad(g_gpu[], g_cpu[], atol, rtol) check_grad(g_gpu::Nothing, g_cpu::Nothing, atol, rtol) = @test true -check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) = @test g_cpu ≈ g_gpu rtol=rtol atol=atol -check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) = - @test g_cpu ≈ collect(g_gpu) rtol=rtol atol=atol +function check_grad(g_gpu::Float32, g_cpu::Float32, atol, rtol) + @test g_cpu≈g_gpu rtol=rtol atol=atol +end +function check_grad(g_gpu::CuArray{Float32}, g_cpu::Array{Float32}, atol, rtol) + @test g_cpu≈collect(g_gpu) rtol=rtol atol=atol +end function check_grad(g_gpu::Tuple, g_cpu::Tuple, atol, rtol) - for (v1, v2) in zip(g_gpu, g_cpu) - check_grad(v1, v2, atol, rtol) - end + for (v1, v2) in zip(g_gpu, g_cpu) + check_grad(v1, v2, atol, rtol) + end end function check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple, atol, rtol) - for ((k1,v1), (k2,v2)) in zip(pairs(g_gpu), pairs(g_cpu)) - @test k1 == k2 - # @show k2 v2 - check_grad(v1, v2, atol, rtol) - end + for ((k1, v1), (k2, v2)) in zip(pairs(g_gpu), pairs(g_cpu)) + @test k1 == k2 + # @show k2 v2 + check_grad(v1, v2, atol, rtol) + end end -function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; - test_equal=true, rtol=1e-4, atol=1e-4) - - check_type(x) = false - check_type(x::Float32) = true - check_type(x::CuArray{Float32}) = true - check_type(x::Array{Float32}) = true +function gpu_autodiff_test(f_cpu, xs_cpu::Array{Float32}...; + test_equal = true, rtol = 1e-4, atol = 1e-4) + check_type(x) = false + check_type(x::Float32) = true + check_type(x::CuArray{Float32}) = true + check_type(x::Array{Float32}) = true - ### GRADIENT WITH RESPECT TO INPUT ##### - # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) - y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) - @test check_type(y_cpu) - Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) - gs_cpu = back_cpu(Δ_cpu) + ### GRADIENT WITH RESPECT TO INPUT ##### + # y_cpu, back_cpu = pullback((f, x...) -> f(x...), f_cpu, xs_cpu...) + y_cpu, back_cpu = pullback((x...) -> f_cpu(x...), xs_cpu...) + @test check_type(y_cpu) + Δ_cpu = size(y_cpu) == () ? randn(Float32) : randn(Float32, size(y_cpu)) + gs_cpu = back_cpu(Δ_cpu) - f_gpu = f_cpu |> gpu - xs_gpu = gpu.(xs_cpu) - Δ_gpu = Δ_cpu |> gpu - # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) - y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) - @test check_type(y_gpu) - gs_gpu = back_gpu(Δ_gpu) + f_gpu = f_cpu |> gpu + xs_gpu = gpu.(xs_cpu) + Δ_gpu = Δ_cpu |> gpu + # y_gpu, back_gpu = pullback((f, x...) -> f(x...), f_gpu, xs_gpu...) + y_gpu, back_gpu = pullback((x...) -> f_gpu(x...), xs_gpu...) + @test check_type(y_gpu) + gs_gpu = back_gpu(Δ_gpu) - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) - check_grad(g_gpu, g_cpu, atol, rtol) + if test_equal + @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol + for (g_gpu, g_cpu) in zip(gs_gpu, gs_cpu) + check_grad(g_gpu, g_cpu, atol, rtol) + end end - end - ### GRADIENT WITH RESPECT TO f ##### - ps_cpu = Flux.params(f_cpu) - y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) - gs_cpu = back_cpu(Δ_cpu) + ### GRADIENT WITH RESPECT TO f ##### + ps_cpu = Flux.params(f_cpu) + y_cpu, back_cpu = pullback(() -> f_cpu(xs_cpu...), ps_cpu) + gs_cpu = back_cpu(Δ_cpu) + + ps_gpu = Flux.params(f_gpu) + y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) + gs_gpu = back_gpu(Δ_gpu) - ps_gpu = Flux.params(f_gpu) - y_gpu, back_gpu = pullback(() -> f_gpu(xs_gpu...), ps_gpu) - gs_gpu = back_gpu(Δ_gpu) - - if test_equal - @test collect(y_cpu) ≈ collect(y_gpu) rtol=rtol atol=atol - @assert length(ps_gpu) == length(ps_cpu) - for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) - check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) + if test_equal + @test collect(y_cpu)≈collect(y_gpu) rtol=rtol atol=atol + @assert length(ps_gpu) == length(ps_cpu) + for (p_gpu, p_cpu) in zip(ps_gpu, ps_cpu) + check_grad(gs_gpu[p_gpu], gs_cpu[p_cpu], atol, rtol) + end end - end end diff --git a/test/data.jl b/test/data.jl index 4e4c485064..08d3a5809c 100644 --- a/test/data.jl +++ b/test/data.jl @@ -4,35 +4,35 @@ using Random X = reshape([1:10;], (2, 5)) Y = [1:5;] - d = DataLoader(X, batchsize=2) + d = DataLoader(X, batchsize = 2) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == typeof(X) @test eltype(batches) == typeof(X) @test length(batches) == 3 - @test batches[1] == X[:,1:2] - @test batches[2] == X[:,3:4] - @test batches[3] == X[:,5:5] + @test batches[1] == X[:, 1:2] + @test batches[2] == X[:, 3:4] + @test batches[3] == X[:, 5:5] - d = DataLoader(X, batchsize=2, partial=false) + d = DataLoader(X, batchsize = 2, partial = false) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == typeof(X) @test eltype(batches) == typeof(X) @test length(batches) == 2 - @test batches[1] == X[:,1:2] - @test batches[2] == X[:,3:4] + @test batches[1] == X[:, 1:2] + @test batches[2] == X[:, 3:4] - d = DataLoader((X,), batchsize=2, partial=false) + d = DataLoader((X,), batchsize = 2, partial = false) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == Tuple{typeof(X)} @test eltype(batches) == Tuple{typeof(X)} @test length(batches) == 2 - @test batches[1] == (X[:,1:2],) - @test batches[2] == (X[:,3:4],) + @test batches[1] == (X[:, 1:2],) + @test batches[2] == (X[:, 3:4],) - d = DataLoader((X, Y), batchsize=2) + d = DataLoader((X, Y), batchsize = 2) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == Tuple{typeof(X), typeof(Y)} @@ -41,41 +41,41 @@ using Random @test length(batches[1]) == 2 @test length(batches[2]) == 2 @test length(batches[3]) == 2 - @test batches[1][1] == X[:,1:2] + @test batches[1][1] == X[:, 1:2] @test batches[1][2] == Y[1:2] - @test batches[2][1] == X[:,3:4] + @test batches[2][1] == X[:, 3:4] @test batches[2][2] == Y[3:4] - @test batches[3][1] == X[:,5:5] + @test batches[3][1] == X[:, 5:5] @test batches[3][2] == Y[5:5] # test with NamedTuple - d = DataLoader((x=X, y=Y), batchsize=2) + d = DataLoader((x = X, y = Y), batchsize = 2) # @inferred first(d) batches = collect(d) # @test eltype(batches) == eltype(d) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} - @test eltype(batches) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} + @test eltype(batches) == NamedTuple{(:x, :y), Tuple{typeof(X), typeof(Y)}} @test length(batches) == 3 @test length(batches[1]) == 2 @test length(batches[2]) == 2 @test length(batches[3]) == 2 - @test batches[1][1] == batches[1].x == X[:,1:2] + @test batches[1][1] == batches[1].x == X[:, 1:2] @test batches[1][2] == batches[1].y == Y[1:2] - @test batches[2][1] == batches[2].x == X[:,3:4] + @test batches[2][1] == batches[2].x == X[:, 3:4] @test batches[2][2] == batches[2].y == Y[3:4] - @test batches[3][1] == batches[3].x == X[:,5:5] + @test batches[3][1] == batches[3].x == X[:, 5:5] @test batches[3][2] == batches[3].y == Y[5:5] # Don't mutate state https://github.com/FluxML/Flux.jl/issues/1227 - d = DataLoader([1:10;], shuffle=true) + d = DataLoader([1:10;], shuffle = true) cd = collect(zip(d, d)) # skip the first since it used to be different also before fixing the bug - @test [cd[i][1] for i=2:10] != [cd[i][2] for i=2:10] - + @test [cd[i][1] for i in 2:10] != [cd[i][2] for i in 2:10] + # test interaction with `train!` θ = ones(2) X = zeros(2, 10) - loss(x) = sum((x .- θ).^2) - d = DataLoader(X) + loss(x) = sum((x .- θ) .^ 2) + d = DataLoader(X) Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1)) @test norm(θ) < 1e-4 @@ -83,11 +83,13 @@ using Random θ = zeros(2) X = ones(2, 10) Y = fill(2, 10) - loss(x, y) = sum((y - x'*θ).^2) - d = DataLoader((X, Y)) + loss(x, y) = sum((y - x' * θ) .^ 2) + d = DataLoader((X, Y)) Flux.train!(loss, Params([θ]), ncycle(d, 10), Descent(0.1)) @test norm(θ .- 1) < 1e-10 # specify the rng - d = map(identity, DataLoader(X, batchsize=2; shuffle=true, rng=Random.seed!(Random.default_rng(), 5))) + d = map(identity, + DataLoader(X, batchsize = 2; shuffle = true, + rng = Random.seed!(Random.default_rng(), 5))) end diff --git a/test/layers/basic.jl b/test/layers/basic.jl index d66aad4f56..b7cfd1aa09 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -2,370 +2,386 @@ using Test, Random import Flux: activations @testset "basic" begin - @testset "helpers" begin - @testset "activations" begin - dummy_model = Chain(x->x.^2, x->x .- 3, x -> tan.(x)) - x = randn(10) - @test activations(dummy_model, x)[1] == x.^2 - @test activations(dummy_model, x)[2] == (x.^2 .- 3) - @test activations(dummy_model, x)[3] == tan.(x.^2 .- 3) - - @test activations(Chain(), x) == () - @test activations(Chain(identity, x->:foo), x)[2] == :foo # results include `Any` type - end - end - - @testset "Chain" begin - @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) - @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10)) - # numeric test should be put into testset of corresponding layer - - @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn(10)) - m = Chain(first = Dense(10, 5, σ), second = Dense(5, 2)) - @test m[:first] == m[1] - @test m[1:2] == m - - @test m == m - @test m == fmap(identity, m) # does not forget names - - @test_throws ArgumentError Chain(layers = Dense(10, 10), two = identity) # reserved name - - @test_nowarn Chain([Dense(10, 5, σ), Dense(5, 2)])(randn(Float32, 10)) # vector of layers - - c = Chain(Dense(10, 5, σ), Dense(5, 2), Dense(2, 1, relu)) - @test c[1] == c[begin] - @test c[3] == c[end] - end - - @testset "Activations" begin - c = Chain(Dense(3,5,relu), Dense(5,1,relu)) - X = Float32.([1.0; 1.0; 1.0]) - @test_nowarn gradient(()->Flux.activations(c, X)[2][1], Flux.params(c)) - - c2 = Chain(enc = c[1], dec = c[2]) - @test Flux.activations(c, X) == Flux.activations(c2, X) - @test_nowarn gradient(()->Flux.activations(c2, X)[2][1], Flux.params(c2)) - end - - @testset "Dense" begin - @testset "constructors" begin - @test size(Dense(10, 100).weight) == (100, 10) - @test size(Dense(10, 100).bias) == (100,) - @test Dense(rand(100,10), rand(100)).σ == identity - @test Dense(rand(100,10)).σ == identity - - @test Dense(rand(100,10), false).σ == identity - @test Dense(rand(100,10), false, tanh).σ == tanh - @test Dense(rand(100,10), rand(100)).σ == identity - @test Dense(rand(Float16, 100,10), true).bias isa Vector{Float16} # creates matching type - @test_skip Dense(rand(Float16, 100,10), rand(100)).bias isa Vector{Float16} # converts to match - - @test Dense(3,4; init=Base.randn, bias=true).bias isa Vector{Float64} - @test_skip Dense(3,4; init=Base.randn, bias=[1,2,3,4]).bias isa Vector{Float64} - - @test_throws MethodError Dense(10, 10.5) - @test_throws MethodError Dense(10, 10.5, tanh) - @test_throws DimensionMismatch Dense(3,4; bias=rand(5)) - @test_throws DimensionMismatch Dense(rand(4,3), rand(5)) - @test_throws MethodError Dense(rand(5)) - @test_throws MethodError Dense(rand(5), rand(5)) - @test_throws MethodError Dense(rand(5), rand(5), tanh) - end - @testset "dimensions" begin - @test length(Dense(10, 5)(randn(10))) == 5 - @test_throws DimensionMismatch Dense(10, 5)(randn(1)) - @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting - @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting - @test size(Dense(10, 5)(randn(10))) == (5,) - @test size(Dense(10, 5)(randn(10,2))) == (5,2) - @test size(Dense(10, 5)(randn(10,2,3))) == (5,2,3) - @test size(Dense(10, 5)(randn(10,2,3,4))) == (5,2,3,4) - @test_throws DimensionMismatch Dense(10, 5)(randn(11,2,3)) - end - @testset "zeros" begin - @test Dense(10, 1, identity, init = ones)(ones(10,1)) == 10*ones(1, 1) - @test Dense(10, 1, identity, init = ones)(ones(10,2)) == 10*ones(1, 2) - @test Dense(10, 2, identity, init = ones)(ones(10,1)) == 10*ones(2, 1) - @test Dense(10, 2, identity, init = ones)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] - @test Dense(10, 2, identity, init = ones, bias = false)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] - end - end - - @testset "Scale" begin - @test length(Flux.Scale(10)(randn(10))) == 10 - @test length(Flux.Scale(10)(randn(1))) == 10 - @test length(Flux.Scale(10; bias = false)(randn(10))) == 10 - @test length(Flux.Scale(10, tanh)(randn(10))) == 10 - @test_throws DimensionMismatch Flux.Scale(10)(randn(2)) - - @test Flux.Scale(2)([1 2]) == [1 2; 1 2] - @test Flux.Scale(2)([1, 2]) == [1, 2] - @test Flux.Scale(2; init = randn)([1, 2]) != [1, 2] - @test Flux.Scale(2; bias = false)([1 2; 3 4]) == [1 2; 3 4] - @test Flux.Scale(2, abs2; bias = false, init = ones)([1 2; 3 4]) == [1 4; 9 16] - - @test Flux.Scale(2)(rand(2, 3, 4)) |> size == (2, 3, 4) - @test Flux.Scale(2, 3;)(rand(2, 3, 4)) |> size == (2, 3, 4) - @test Flux.Scale(2, 3, 4; bias = false)(rand(2, 3, 4)) |> size == (2, 3, 4) - @test Flux.Scale(2, 3; bias = false)(rand(2, 1, 4)) |> size == (2, 3, 4) - @test Flux.Scale(2, 3, tanh; bias = false, init = zeros)(rand(2, 1, 4)) == zeros(2, 3, 4) - - @test_throws MethodError Flux.Scale(1.) - @test_throws MethodError Flux.Scale(1., 2.) - @test_throws Exception Flux.Scale() - @test_throws MethodError Flux.Scale(sin) - end - - @testset "Maxout" begin - # Note that the normal common usage of Maxout is as per the docstring - # These are abnormal constructors used for testing purposes - - @testset "Constructor" begin - mo = Maxout(() -> identity, 4) - input = rand(40) - @test mo(input) == input - end + @testset "helpers" begin @testset "activations" begin + dummy_model = Chain(x -> x .^ 2, x -> x .- 3, x -> tan.(x)) + x = randn(10) + @test activations(dummy_model, x)[1] == x .^ 2 + @test activations(dummy_model, x)[2] == (x .^ 2 .- 3) + @test activations(dummy_model, x)[3] == tan.(x .^ 2 .- 3) - @testset "simple alternatives" begin - mo = Maxout(x -> x, x -> 2x, x -> 0.5x) - input = rand(40) - @test mo(input) == 2*input - end + @test activations(Chain(), x) == () + @test activations(Chain(identity, x -> :foo), x)[2] == :foo # results include `Any` type + end end - @testset "complex alternatives" begin - mo = Maxout(x -> [0.5; 0.1]*x, x -> [0.2; 0.7]*x) - input = [3.0 2.0] - target = [0.5, 0.7].*input - @test mo(input) == target - end + @testset "Chain" begin + @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) + @test_throws DimensionMismatch Chain(Dense(10, 5, σ), Dense(2, 1))(randn(10)) + # numeric test should be put into testset of corresponding layer - @testset "params" begin - mo = Maxout(()->Dense(32, 64), 4) - ps = Flux.params(mo) - @test length(ps) == 8 #4 alts, each with weight and bias - end - end + @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn(10)) + m = Chain(first = Dense(10, 5, σ), second = Dense(5, 2)) + @test m[:first] == m[1] + @test m[1:2] == m - @testset "SkipConnection" begin - @testset "zero sum" begin - input = randn(10, 10, 10, 10) - @test SkipConnection(x -> zeros(size(x)), (a,b) -> a + b)(input) == input - end + @test m == m + @test m == fmap(identity, m) # does not forget names - @testset "concat size" begin - input = randn(10, 2) - @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) - end - end - - @testset "Bilinear" begin - @testset "SkipConnection recombinator" begin - d = Dense(10, 10) - b = Flux.Bilinear(10, 10, 5) - x = randn(Float32,10,9) - sc = SkipConnection(d, b) - @test size(sc(x)) == (5,9) - end + @test_throws ArgumentError Chain(layers = Dense(10, 10), two = identity) # reserved name - @testset "Two-streams zero sum" begin - x = zeros(Float32,10,9) - y = zeros(Float32,2,9) - b = Flux.Bilinear(10, 2, 3) - @test size(b(x,y)) == (3,9) - @test sum(abs2, b(x,y)) == 0f0 - end + @test_nowarn Chain([Dense(10, 5, σ), Dense(5, 2)])(randn(Float32, 10)) # vector of layers - @testset "Inner interactions" begin - x = randn(Float32,11,7) - b = Flux.Bilinear(11, 11, 3) - @test size(b(x)) == (3,7) - @test_nowarn gs = gradient(() -> sum(abs2.(b(x))), params(b)) + c = Chain(Dense(10, 5, σ), Dense(5, 2), Dense(2, 1, relu)) + @test c[1] == c[begin] + @test c[3] == c[end] end - @testset "constructors" begin - b1 = Flux.Bilinear(randn(3,4,5)) - @test b1.bias isa Vector{Float64} - @test b1.σ == identity - - b2 = Flux.Bilinear(randn(3,4,5), false) - @test b2.bias === false - - b3 = Flux.Bilinear(randn(Float16, 3,4,5), true, tanh) - @test b3.σ == tanh - @test b3.bias isa Vector{Float16} - @test size(b3(rand(4), rand(5))) == (3,) + @testset "Activations" begin + c = Chain(Dense(3, 5, relu), Dense(5, 1, relu)) + X = Float32.([1.0; 1.0; 1.0]) + @test_nowarn gradient(() -> Flux.activations(c, X)[2][1], Flux.params(c)) - b4 = Flux.Bilinear(3,3,7; bias=1:7, init=Flux.zeros32) - @test_skip b4.bias isa Vector{Float32} - - @test_throws ArgumentError Flux.Bilinear(rand(3)) # expects a 3-array - @test_throws ArgumentError Flux.Bilinear(rand(3,4), false, tanh) - @test_throws DimensionMismatch Flux.Bilinear(rand(3,4,5), rand(6), tanh) # wrong length bias + c2 = Chain(enc = c[1], dec = c[2]) + @test Flux.activations(c, X) == Flux.activations(c2, X) + @test_nowarn gradient(() -> Flux.activations(c2, X)[2][1], Flux.params(c2)) end - end - @testset "Parallel" begin - @testset "zero sum" begin - input = randn(10, 10, 10, 10) - @test Parallel(+, x -> zeros(size(x)), identity)(input) == input + @testset "Dense" begin + @testset "constructors" begin + @test size(Dense(10, 100).weight) == (100, 10) + @test size(Dense(10, 100).bias) == (100,) + @test Dense(rand(100, 10), rand(100)).σ == identity + @test Dense(rand(100, 10)).σ == identity + + @test Dense(rand(100, 10), false).σ == identity + @test Dense(rand(100, 10), false, tanh).σ == tanh + @test Dense(rand(100, 10), rand(100)).σ == identity + @test Dense(rand(Float16, 100, 10), true).bias isa Vector{Float16} # creates matching type + @test_skip Dense(rand(Float16, 100, 10), rand(100)).bias isa Vector{Float16} # converts to match + + @test Dense(3, 4; init = Base.randn, bias = true).bias isa Vector{Float64} + @test_skip Dense(3, 4; init = Base.randn, bias = [1, 2, 3, 4]).bias isa + Vector{Float64} + + @test_throws MethodError Dense(10, 10.5) + @test_throws MethodError Dense(10, 10.5, tanh) + @test_throws DimensionMismatch Dense(3, 4; bias = rand(5)) + @test_throws DimensionMismatch Dense(rand(4, 3), rand(5)) + @test_throws MethodError Dense(rand(5)) + @test_throws MethodError Dense(rand(5), rand(5)) + @test_throws MethodError Dense(rand(5), rand(5), tanh) + end + @testset "dimensions" begin + @test length(Dense(10, 5)(randn(10))) == 5 + @test_throws DimensionMismatch Dense(10, 5)(randn(1)) + @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting + @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting + @test size(Dense(10, 5)(randn(10))) == (5,) + @test size(Dense(10, 5)(randn(10, 2))) == (5, 2) + @test size(Dense(10, 5)(randn(10, 2, 3))) == (5, 2, 3) + @test size(Dense(10, 5)(randn(10, 2, 3, 4))) == (5, 2, 3, 4) + @test_throws DimensionMismatch Dense(10, 5)(randn(11, 2, 3)) + end + @testset "zeros" begin + @test Dense(10, 1, identity, init = ones)(ones(10, 1)) == 10 * ones(1, 1) + @test Dense(10, 1, identity, init = ones)(ones(10, 2)) == 10 * ones(1, 2) + @test Dense(10, 2, identity, init = ones)(ones(10, 1)) == 10 * ones(2, 1) + @test Dense(10, 2, identity, init = ones)([ones(10, 1) 2 * ones(10, 1)]) == + [10 20; 10 20] + @test Dense(10, 2, identity, init = ones, bias = false)([ones(10, 1) 2 * + ones(10, + 1)]) == + [10 20; 10 20] + end end - @testset "concat size" begin - input = randn(10, 2) - @test size(Parallel((a, b) -> cat(a, b; dims=2), Dense(10, 10), identity)(input)) == (10, 4) - @test size(Parallel(hcat, one = Dense(10, 10), two = identity)(input)) == (10, 4) + @testset "Scale" begin + @test length(Flux.Scale(10)(randn(10))) == 10 + @test length(Flux.Scale(10)(randn(1))) == 10 + @test length(Flux.Scale(10; bias = false)(randn(10))) == 10 + @test length(Flux.Scale(10, tanh)(randn(10))) == 10 + @test_throws DimensionMismatch Flux.Scale(10)(randn(2)) + + @test Flux.Scale(2)([1 2]) == [1 2; 1 2] + @test Flux.Scale(2)([1, 2]) == [1, 2] + @test Flux.Scale(2; init = randn)([1, 2]) != [1, 2] + @test Flux.Scale(2; bias = false)([1 2; 3 4]) == [1 2; 3 4] + @test Flux.Scale(2, abs2; bias = false, init = ones)([1 2; 3 4]) == [1 4; 9 16] + + @test Flux.Scale(2)(rand(2, 3, 4)) |> size == (2, 3, 4) + @test Flux.Scale(2, 3;)(rand(2, 3, 4)) |> size == (2, 3, 4) + @test Flux.Scale(2, 3, 4; bias = false)(rand(2, 3, 4)) |> size == (2, 3, 4) + @test Flux.Scale(2, 3; bias = false)(rand(2, 1, 4)) |> size == (2, 3, 4) + @test Flux.Scale(2, 3, tanh; bias = false, init = zeros)(rand(2, 1, 4)) == + zeros(2, 3, 4) + + @test_throws MethodError Flux.Scale(1.0) + @test_throws MethodError Flux.Scale(1.0, 2.0) + @test_throws Exception Flux.Scale() + @test_throws MethodError Flux.Scale(sin) end - @testset "vararg input" begin - inputs = randn(10), randn(5), randn(4) - @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,) - @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) == (2,) - @test_throws ArgumentError Parallel(+, sin, cos)(1,2,3) # wrong number of inputs - @test Parallel(+, sin, cos)(pi/2) ≈ 1 + @testset "Maxout" begin + # Note that the normal common usage of Maxout is as per the docstring + # These are abnormal constructors used for testing purposes + + @testset "Constructor" begin + mo = Maxout(() -> identity, 4) + input = rand(40) + @test mo(input) == input + end + + @testset "simple alternatives" begin + mo = Maxout(x -> x, x -> 2x, x -> 0.5x) + input = rand(40) + @test mo(input) == 2 * input + end + + @testset "complex alternatives" begin + mo = Maxout(x -> [0.5; 0.1] * x, x -> [0.2; 0.7] * x) + input = [3.0 2.0] + target = [0.5, 0.7] .* input + @test mo(input) == target + end + + @testset "params" begin + mo = Maxout(() -> Dense(32, 64), 4) + ps = Flux.params(mo) + @test length(ps) == 8 #4 alts, each with weight and bias + end end - @testset "named access" begin - m = Parallel(hcat, one = Dense(10, 10), two = identity) - @test m[1] == m[:one] - @test m[1:2] == m - - @test_throws ArgumentError Parallel(hcat, layers = Dense(10, 10), two = identity) # reserved names - @test_throws ArgumentError Parallel(hcat, connection = Dense(10, 10), two = identity) - - @test m == fmap(identity, m) # does not forget names - - @test Parallel(vcat, x = log)(1) == [0] - @test Parallel(vcat, log)(1) == [0] + @testset "SkipConnection" begin + @testset "zero sum" begin + input = randn(10, 10, 10, 10) + @test SkipConnection(x -> zeros(size(x)), (a, b) -> a + b)(input) == input + end + + @testset "concat size" begin + input = randn(10, 2) + @test size(SkipConnection(Dense(10, 10), (a, b) -> cat(a, b, dims = 2))(input)) == + (10, 4) + end end - @testset "trivial cases" begin - @test Parallel(hcat) isa Parallel{typeof(hcat), Tuple{}} # not a NamedTuple - @test Parallel(hcat)(1) == hcat() - @test Parallel(hcat, inv)(2) == hcat(1/2) # still calls connection once. + @testset "Bilinear" begin + @testset "SkipConnection recombinator" begin + d = Dense(10, 10) + b = Flux.Bilinear(10, 10, 5) + x = randn(Float32, 10, 9) + sc = SkipConnection(d, b) + @test size(sc(x)) == (5, 9) + end + + @testset "Two-streams zero sum" begin + x = zeros(Float32, 10, 9) + y = zeros(Float32, 2, 9) + b = Flux.Bilinear(10, 2, 3) + @test size(b(x, y)) == (3, 9) + @test sum(abs2, b(x, y)) == 0.0f0 + end + + @testset "Inner interactions" begin + x = randn(Float32, 11, 7) + b = Flux.Bilinear(11, 11, 3) + @test size(b(x)) == (3, 7) + @test_nowarn gs = gradient(() -> sum(abs2.(b(x))), params(b)) + end + + @testset "constructors" begin + b1 = Flux.Bilinear(randn(3, 4, 5)) + @test b1.bias isa Vector{Float64} + @test b1.σ == identity + + b2 = Flux.Bilinear(randn(3, 4, 5), false) + @test b2.bias === false + + b3 = Flux.Bilinear(randn(Float16, 3, 4, 5), true, tanh) + @test b3.σ == tanh + @test b3.bias isa Vector{Float16} + @test size(b3(rand(4), rand(5))) == (3,) + + b4 = Flux.Bilinear(3, 3, 7; bias = 1:7, init = Flux.zeros32) + @test_skip b4.bias isa Vector{Float32} + + @test_throws ArgumentError Flux.Bilinear(rand(3)) # expects a 3-array + @test_throws ArgumentError Flux.Bilinear(rand(3, 4), false, tanh) + @test_throws DimensionMismatch Flux.Bilinear(rand(3, 4, 5), rand(6), tanh) # wrong length bias + end end - @testset "connection is called once" begin - CNT = Ref(0) - f_cnt = (x...) -> (CNT[]+=1; +(x...)) - Parallel(f_cnt, sin, cos, tan)(1) - @test CNT[] == 1 - Parallel(f_cnt, sin, cos, tan)(1,2,3) - @test CNT[] == 2 - Parallel(f_cnt, sin)(1) - @test CNT[] == 3 + @testset "Parallel" begin + @testset "zero sum" begin + input = randn(10, 10, 10, 10) + @test Parallel(+, x -> zeros(size(x)), identity)(input) == input + end + + @testset "concat size" begin + input = randn(10, 2) + @test size(Parallel((a, b) -> cat(a, b; dims = 2), Dense(10, 10), identity)(input)) == + (10, 4) + @test size(Parallel(hcat, one = Dense(10, 10), two = identity)(input)) == + (10, 4) + end + + @testset "vararg input" begin + inputs = randn(10), randn(5), randn(4) + @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,) + @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) == + (2,) + @test_throws ArgumentError Parallel(+, sin, cos)(1, 2, 3) # wrong number of inputs + @test Parallel(+, sin, cos)(pi / 2) ≈ 1 + end + + @testset "named access" begin + m = Parallel(hcat, one = Dense(10, 10), two = identity) + @test m[1] == m[:one] + @test m[1:2] == m + + @test_throws ArgumentError Parallel(hcat, layers = Dense(10, 10), + two = identity) # reserved names + @test_throws ArgumentError Parallel(hcat, connection = Dense(10, 10), + two = identity) + + @test m == fmap(identity, m) # does not forget names + + @test Parallel(vcat, x = log)(1) == [0] + @test Parallel(vcat, log)(1) == [0] + end + + @testset "trivial cases" begin + @test Parallel(hcat) isa Parallel{typeof(hcat), Tuple{}} # not a NamedTuple + @test Parallel(hcat)(1) == hcat() + @test Parallel(hcat, inv)(2) == hcat(1 / 2) # still calls connection once. + end + + @testset "connection is called once" begin + CNT = Ref(0) + f_cnt = (x...) -> (CNT[] += 1; +(x...)) + Parallel(f_cnt, sin, cos, tan)(1) + @test CNT[] == 1 + Parallel(f_cnt, sin, cos, tan)(1, 2, 3) + @test CNT[] == 2 + Parallel(f_cnt, sin)(1) + @test CNT[] == 3 + end + + # Ref https://github.com/FluxML/Flux.jl/issues/1673 + @testset "Input domain" begin + struct Input + x::Any + end + + struct L1 + x::Any + end + (l::L1)(x) = l.x * x + Flux.@functor L1 + Base.:*(a::AbstractArray, b::Input) = a * b.x + + par = Parallel(+, L1(rand(Float32, 3, 3)), L1(rand(Float32, 3, 3))) + ip = Input(rand(Float32, 3, 3)) + ip2 = Input(rand(Float32, 3, 3)) + + @test par(ip) ≈ par.layers[1](ip.x) + par.layers[2](ip.x) + @test par(ip, ip2) ≈ par.layers[1](ip.x) + par.layers[2](ip2.x) + gs = gradient((par, x...) -> sum(par(x...)), par, ip, ip2) + gs_reg = gradient(par, ip, ip2) do par, x, y + return sum(par.layers[1](x.x) + par.layers[2](y.x)) + end + + for (a, b) in zip(gs[1].layers, gs_reg[1].layers) + @test a.x ≈ b.x + end + @test gs[2].x ≈ gs_reg[2].x + @test gs[3].x ≈ gs_reg[3].x + end end - # Ref https://github.com/FluxML/Flux.jl/issues/1673 - @testset "Input domain" begin - struct Input - x - end - - struct L1 - x - end - (l::L1)(x) = l.x * x - Flux.@functor L1 - Base.:*(a::AbstractArray, b::Input) = a * b.x - - par = Parallel(+, L1(rand(Float32, 3,3)), L1(rand(Float32, 3,3))) - ip = Input(rand(Float32, 3,3)) - ip2 = Input(rand(Float32, 3,3)) - - @test par(ip) ≈ par.layers[1](ip.x) + par.layers[2](ip.x) - @test par(ip, ip2) ≈ par.layers[1](ip.x) + par.layers[2](ip2.x) - gs = gradient((par, x...) -> sum(par(x...)), par, ip, ip2) - gs_reg = gradient(par, ip, ip2) do par, x, y - sum(par.layers[1](x.x) + par.layers[2](y.x)) - end - - for (a,b) in zip(gs[1].layers, gs_reg[1].layers) - @test a.x ≈ b.x - end - @test gs[2].x ≈ gs_reg[2].x - @test gs[3].x ≈ gs_reg[3].x + @testset "Embedding" begin + vocab_size, embed_size = 10, 4 + m = Flux.Embedding(vocab_size, embed_size) + @test size(m.weight) == (embed_size, vocab_size) + + x = rand(1:vocab_size, 3) + y = m(x) + @test y isa Matrix{Float32} + @test y ≈ m.weight[:, x] + x2 = OneHotMatrix(x, vocab_size) + y2 = m(x2) + @test y2 isa Matrix{Float32} + @test y2 ≈ y + @test_throws DimensionMismatch m(OneHotMatrix(x, 1000)) + + x = rand(1:vocab_size, 3, 4) + y = m(x) + @test y isa Array{Float32, 3} + @test size(y) == (embed_size, 3, 4) + + @test m(2) ≈ m.weight[:, 2] + @test m(OneHotVector(3, vocab_size)) ≈ m.weight[:, 3] + @test_throws DimensionMismatch m(OneHotVector(3, 1000)) end - end - - @testset "Embedding" begin - vocab_size, embed_size = 10, 4 - m = Flux.Embedding(vocab_size, embed_size) - @test size(m.weight) == (embed_size, vocab_size) - - x = rand(1:vocab_size, 3) - y = m(x) - @test y isa Matrix{Float32} - @test y ≈ m.weight[:,x] - x2 = OneHotMatrix(x, vocab_size) - y2 = m(x2) - @test y2 isa Matrix{Float32} - @test y2 ≈ y - @test_throws DimensionMismatch m(OneHotMatrix(x, 1000)) - - x = rand(1:vocab_size, 3, 4) - y = m(x) - @test y isa Array{Float32, 3} - @test size(y) == (embed_size, 3, 4) - - @test m(2) ≈ m.weight[:,2] - @test m(OneHotVector(3, vocab_size)) ≈ m.weight[:,3] - @test_throws DimensionMismatch m(OneHotVector(3, 1000)) - end end @testset "second derivatives" begin - m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) - @test Zygote.hessian_dual(sum∘m1, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m1, [1,2,3]) - - m1v = Chain([m1[1], m1[2]]) # vector of layers - @test Zygote.hessian_dual(sum∘m1v, [1,2,3]) ≈ Zygote.hessian_dual(sum∘m1, [1,2,3]) - @test_broken Zygote.hessian_dual(sum∘m1v, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m1v, [1,2,3]) - - # NNlib's softmax gradient writes in-place - m2 = Chain(Dense(3,4,tanh), Dense(4,2), softmax) - @test_broken Zygote.hessian_dual(sum∘m2, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m2, [1,2,3]) - - # https://github.com/FluxML/NNlib.jl/issues/362 - m3 = Chain(Conv((3,), 2 => 3, relu), Dense(2,2)) - x3 = cat(Float32[1 2; 3 4; 5 6; 7 8]; dims=3) - @test Zygote.hessian_dual(sum∘m3, x3) ≈ Zygote.hessian_reverse(sum∘m3, x3) + m1 = Chain(Dense(3, 4, tanh; bias = false), Dense(4, 2)) + @test Zygote.hessian_dual(sum ∘ m1, [1, 2, 3]) ≈ + Zygote.hessian_reverse(sum ∘ m1, [1, 2, 3]) + + m1v = Chain([m1[1], m1[2]]) # vector of layers + @test Zygote.hessian_dual(sum ∘ m1v, [1, 2, 3]) ≈ + Zygote.hessian_dual(sum ∘ m1, [1, 2, 3]) + @test_broken Zygote.hessian_dual(sum ∘ m1v, [1, 2, 3]) ≈ + Zygote.hessian_reverse(sum ∘ m1v, [1, 2, 3]) + + # NNlib's softmax gradient writes in-place + m2 = Chain(Dense(3, 4, tanh), Dense(4, 2), softmax) + @test_broken Zygote.hessian_dual(sum ∘ m2, [1, 2, 3]) ≈ + Zygote.hessian_reverse(sum ∘ m2, [1, 2, 3]) + + # https://github.com/FluxML/NNlib.jl/issues/362 + m3 = Chain(Conv((3,), 2 => 3, relu), Dense(2, 2)) + x3 = cat(Float32[1 2; 3 4; 5 6; 7 8]; dims = 3) + @test Zygote.hessian_dual(sum ∘ m3, x3) ≈ Zygote.hessian_reverse(sum ∘ m3, x3) end @testset "gradients of Chain{Vector}" begin - m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) - m1v = Chain([m1[1], m1[2]]) - @test sum(length, params(m1)) == sum(length, params(m1v)) - - x1 = randn(Float32,3,5) - @test m1(x1) ≈ m1v(x1) - - y1 = rand(Bool,2,5) - g1 = gradient(() -> Flux.Losses.logitcrossentropy(m1(x1), y1), params(m1)) - g1v = gradient(() -> Flux.Losses.logitcrossentropy(m1v(x1), y1), params(m1v)) - @test g1[m1[1].weight] ≈ g1v[m1v[1].weight] - @test g1[m1[2].bias] ≈ g1v[m1v[2].bias] - - @test Flux.destructure(m1)[1] ≈ Flux.destructure(m1v)[1] - z1 = rand(22); - @test Flux.destructure(m1)[2](z1)[1].weight ≈ Flux.destructure(m1v)[2](z1)[1].weight - # Note that Flux.destructure(m1v)[2](z) has a Chain{Tuple}, as does m1v[1:2] + m1 = Chain(Dense(3, 4, tanh; bias = false), Dense(4, 2)) + m1v = Chain([m1[1], m1[2]]) + @test sum(length, params(m1)) == sum(length, params(m1v)) + + x1 = randn(Float32, 3, 5) + @test m1(x1) ≈ m1v(x1) + + y1 = rand(Bool, 2, 5) + g1 = gradient(() -> Flux.Losses.logitcrossentropy(m1(x1), y1), params(m1)) + g1v = gradient(() -> Flux.Losses.logitcrossentropy(m1v(x1), y1), params(m1v)) + @test g1[m1[1].weight] ≈ g1v[m1v[1].weight] + @test g1[m1[2].bias] ≈ g1v[m1v[2].bias] + + @test Flux.destructure(m1)[1] ≈ Flux.destructure(m1v)[1] + z1 = rand(22) + @test Flux.destructure(m1)[2](z1)[1].weight ≈ Flux.destructure(m1v)[2](z1)[1].weight + # Note that Flux.destructure(m1v)[2](z) has a Chain{Tuple}, as does m1v[1:2] end @testset "PairwiseFusion" begin - x = (rand(1, 10), rand(30, 10)) - layer = PairwiseFusion(+, Dense(1, 30), Dense(30, 10)) - y = layer(x) - @test length(y) == 2 - @test size(y[1]) == (30, 10) - @test size(y[2]) == (10, 10) - - x = rand(1, 10) - layer = PairwiseFusion(.+, Dense(1, 10), Dense(10, 1)) - y = layer(x) - @test length(y) == 2 - @test size(y[1]) == (10, 10) - @test size(y[2]) == (1, 10) - - @test PairwiseFusion(vcat, x->x.+1, x->x.+2, x->x.^3)(2, 10, 20) == (3, [5, 12], [125, 1728, 8000]) - @test PairwiseFusion(vcat, x->x.+1, x->x.+2, x->x.^3)(7) == (8, [10, 9], [1000, 729, 343]) + x = (rand(1, 10), rand(30, 10)) + layer = PairwiseFusion(+, Dense(1, 30), Dense(30, 10)) + y = layer(x) + @test length(y) == 2 + @test size(y[1]) == (30, 10) + @test size(y[2]) == (10, 10) + + x = rand(1, 10) + layer = PairwiseFusion(.+, Dense(1, 10), Dense(10, 1)) + y = layer(x) + @test length(y) == 2 + @test size(y[1]) == (10, 10) + @test size(y[2]) == (1, 10) + + @test PairwiseFusion(vcat, x -> x .+ 1, x -> x .+ 2, x -> x .^ 3)(2, 10, 20) == + (3, [5, 12], [125, 1728, 8000]) + @test PairwiseFusion(vcat, x -> x .+ 1, x -> x .+ 2, x -> x .^ 3)(7) == + (8, [10, 9], [1000, 729, 343]) end diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 019f3fd603..85ed0cb0f4 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -3,115 +3,114 @@ using Flux: maxpool, meanpool using Flux: gradient @testset "Pooling" begin - x = randn(Float32, 10, 10, 3, 2) - y = randn(Float32, 20, 20, 3, 2) - ampx = AdaptiveMaxPool((5,5)) - @test ampx(x) == maxpool(x, PoolDims(x, 2)) - ampx = AdaptiveMeanPool((5,5)) - @test ampx(x) == meanpool(x, PoolDims(x, 2)) - ampy = AdaptiveMaxPool((10, 5)) - @test ampy(y) == maxpool(y, PoolDims(y, (2, 4))) - ampy = AdaptiveMeanPool((10, 5)) - @test ampy(y) == meanpool(y, PoolDims(y, (2, 4))) - gmp = GlobalMaxPool() - @test size(gmp(x)) == (1, 1, 3, 2) - gmp = GlobalMeanPool() - @test size(gmp(x)) == (1, 1, 3, 2) - mp = MaxPool((2, 2)) - @test mp(x) == maxpool(x, PoolDims(x, 2)) - mp = MeanPool((2, 2)) - @test mp(x) == meanpool(x, PoolDims(x, 2)) + x = randn(Float32, 10, 10, 3, 2) + y = randn(Float32, 20, 20, 3, 2) + ampx = AdaptiveMaxPool((5, 5)) + @test ampx(x) == maxpool(x, PoolDims(x, 2)) + ampx = AdaptiveMeanPool((5, 5)) + @test ampx(x) == meanpool(x, PoolDims(x, 2)) + ampy = AdaptiveMaxPool((10, 5)) + @test ampy(y) == maxpool(y, PoolDims(y, (2, 4))) + ampy = AdaptiveMeanPool((10, 5)) + @test ampy(y) == meanpool(y, PoolDims(y, (2, 4))) + gmp = GlobalMaxPool() + @test size(gmp(x)) == (1, 1, 3, 2) + gmp = GlobalMeanPool() + @test size(gmp(x)) == (1, 1, 3, 2) + mp = MaxPool((2, 2)) + @test mp(x) == maxpool(x, PoolDims(x, 2)) + mp = MeanPool((2, 2)) + @test mp(x) == meanpool(x, PoolDims(x, 2)) end @testset "CNN" begin - r = zeros(Float32, 28, 28, 1, 5) - m = Chain( - Conv((2, 2), 1 => 16, relu), - MaxPool((2,2)), - Conv((2, 2), 16 => 8, relu), - MaxPool((2,2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), softmax) - - @test size(m(r)) == (10, 5) - - # Test bias switch - bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3)) - ip = zeros(Float32, 28,28,1,1) - - op = bias(ip) - @test sum(op) == prod(size(op)) - - @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32) - bias = Conv((2,2), 1=>3, bias = false) |> lmap + r = zeros(Float32, 28, 28, 1, 5) + m = Chain(Conv((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + Conv((2, 2), 16 => 8, relu), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), softmax) + + @test size(m(r)) == (10, 5) + + # Test bias switch + bias = Conv(ones(Float32, 2, 2, 1, 3), ones(Float32, 3)) + ip = zeros(Float32, 28, 28, 1, 1) + op = bias(ip) - @test sum(op) ≈ 0.f0 - gs = gradient(() -> sum(bias(ip)), Flux.params(bias)) - @test bias.bias ∉ gs.params - end - - # Train w/o bias and make sure no convergence happens - # when only bias can be converged - bias = Conv((2, 2), 1=>3, bias = false); - ip = zeros(Float32, 28,28,1,1) - op = zeros(Float32, 27,27,3,1) .+ 2.f0 - opt = Descent() - - for _ = 1:10^3 - gs = gradient(Flux.params(bias)) do - Flux.Losses.mse(bias(ip), op) + @test sum(op) == prod(size(op)) + + @testset "No bias mapped through $lmap" for lmap in (identity, cpu, f32) + bias = Conv((2, 2), 1 => 3, bias = false) |> lmap + op = bias(ip) + @test sum(op) ≈ 0.0f0 + gs = gradient(() -> sum(bias(ip)), Flux.params(bias)) + @test bias.bias ∉ gs.params end - Flux.Optimise.update!(opt, params(bias), gs) - end - @test Flux.Losses.mse(bias(ip), op) ≈ 4.f0 + # Train w/o bias and make sure no convergence happens + # when only bias can be converged + bias = Conv((2, 2), 1 => 3, bias = false) + ip = zeros(Float32, 28, 28, 1, 1) + op = zeros(Float32, 27, 27, 3, 1) .+ 2.0f0 + opt = Descent() - @testset "Grouped Conv" begin - ip = rand(Float32, 28, 100, 2) - c = Conv((3,), 100 => 25, groups = 5) - @test size(c.weight) == (3, 20, 25) - @test size(c(ip)) == (26, 25, 2) + for _ in 1:(10^3) + gs = gradient(Flux.params(bias)) do + return Flux.Losses.mse(bias(ip), op) + end + Flux.Optimise.update!(opt, params(bias), gs) + end - ip = rand(Float32, 28, 28, 100, 2) - c = Conv((3,3), 100 => 25, groups = 5) - @test size(c.weight) == (3, 3, 20, 25) - @test size(c(ip)) == (26, 26, 25, 2) + @test Flux.Losses.mse(bias(ip), op) ≈ 4.0f0 - ip = rand(Float32, 10, 11, 12, 100, 2) - c = Conv((3,4,5), 100 => 25, groups = 5) - @test size(c.weight) == (3,4,5, 20, 25) - @test size(c(ip)) == (8,8,8, 25, 2) + @testset "Grouped Conv" begin + ip = rand(Float32, 28, 100, 2) + c = Conv((3,), 100 => 25, groups = 5) + @test size(c.weight) == (3, 20, 25) + @test size(c(ip)) == (26, 25, 2) - # Test that we cannot ask for non-integer multiplication factors - @test_throws AssertionError Conv((2, 2), 3=>10, groups=2) - @test_throws AssertionError Conv((2, 2), 2=>9, groups=2) - end + ip = rand(Float32, 28, 28, 100, 2) + c = Conv((3, 3), 100 => 25, groups = 5) + @test size(c.weight) == (3, 3, 20, 25) + @test size(c(ip)) == (26, 26, 25, 2) + + ip = rand(Float32, 10, 11, 12, 100, 2) + c = Conv((3, 4, 5), 100 => 25, groups = 5) + @test size(c.weight) == (3, 4, 5, 20, 25) + @test size(c(ip)) == (8, 8, 8, 25, 2) + + # Test that we cannot ask for non-integer multiplication factors + @test_throws AssertionError Conv((2, 2), 3 => 10, groups = 2) + @test_throws AssertionError Conv((2, 2), 2 => 9, groups = 2) + end end @testset "_channels_in, _channels_out" begin _channels_in = Flux._channels_in _channels_out = Flux._channels_out - @test _channels_in(Conv((3,) , 2=>4)) == 2 - @test _channels_in(Conv((5,6,) , 2=>4)) == 2 - @test _channels_in(Conv((1,2,3), 2=>4)) == 2 - @test _channels_out(Conv((3,) , 2=>4)) == 4 - @test _channels_out(Conv((5,6,) , 2=>4)) == 4 - @test _channels_out(Conv((1,2,3), 2=>4)) == 4 - - @test _channels_in( ConvTranspose((3,) , 1=>4)) == 1 - @test _channels_in( ConvTranspose((5,6,) , 2=>4)) == 2 - @test _channels_in( ConvTranspose((1,2,3), 3=>4)) == 3 - @test _channels_out(ConvTranspose((3,) , 2=>1)) == 1 - @test _channels_out(ConvTranspose((5,6,) , 2=>2)) == 2 - @test _channels_out(ConvTranspose((1,2,3), 2=>3)) == 3 - - @test _channels_in( ConvTranspose((6,) , 8=>4, groups=4)) == 8 - @test _channels_in( ConvTranspose((5,6,) , 2=>4, groups=2)) == 2 - @test _channels_in( ConvTranspose((1,2,3), 3=>6, groups=3)) == 3 - - @test _channels_out(ConvTranspose((1,) , 10=>15, groups=5)) == 15 - @test _channels_out(ConvTranspose((3,2) , 10=>15, groups=5)) == 15 - @test _channels_out(ConvTranspose((5,6,) , 2=>2, groups=2)) == 2 + @test _channels_in(Conv((3,), 2 => 4)) == 2 + @test _channels_in(Conv((5, 6), 2 => 4)) == 2 + @test _channels_in(Conv((1, 2, 3), 2 => 4)) == 2 + @test _channels_out(Conv((3,), 2 => 4)) == 4 + @test _channels_out(Conv((5, 6), 2 => 4)) == 4 + @test _channels_out(Conv((1, 2, 3), 2 => 4)) == 4 + + @test _channels_in(ConvTranspose((3,), 1 => 4)) == 1 + @test _channels_in(ConvTranspose((5, 6), 2 => 4)) == 2 + @test _channels_in(ConvTranspose((1, 2, 3), 3 => 4)) == 3 + @test _channels_out(ConvTranspose((3,), 2 => 1)) == 1 + @test _channels_out(ConvTranspose((5, 6), 2 => 2)) == 2 + @test _channels_out(ConvTranspose((1, 2, 3), 2 => 3)) == 3 + + @test _channels_in(ConvTranspose((6,), 8 => 4, groups = 4)) == 8 + @test _channels_in(ConvTranspose((5, 6), 2 => 4, groups = 2)) == 2 + @test _channels_in(ConvTranspose((1, 2, 3), 3 => 6, groups = 3)) == 3 + + @test _channels_out(ConvTranspose((1,), 10 => 15, groups = 5)) == 15 + @test _channels_out(ConvTranspose((3, 2), 10 => 15, groups = 5)) == 15 + @test _channels_out(ConvTranspose((5, 6), 2 => 2, groups = 2)) == 2 for Layer in [Conv, ConvTranspose] for _ in 1:10 @@ -119,170 +118,173 @@ end kernel_size = Tuple(rand(1:5) for _ in rand(1:3)) cin = rand(1:5) * groups cout = rand(1:5) * groups - @test _channels_in(Layer(kernel_size, cin=>cout; groups)) == cin - @test _channels_out(Layer(kernel_size, cin=>cout; groups)) == cout + @test _channels_in(Layer(kernel_size, cin => cout; groups)) == cin + @test _channels_out(Layer(kernel_size, cin => cout; groups)) == cout end end end @testset "asymmetric padding" begin - r = ones(Float32, 28, 28, 1, 1) - m = Conv((3, 3), 1=>1, relu; pad=(0,1,1,2)) - m.weight[:] .= 1.0 - m.bias[:] .= 0.0 - y_hat = m(r)[:,:,1,1] - @test size(y_hat) == (27, 29) - @test y_hat[1, 1] ≈ 6.0 - @test y_hat[2, 2] ≈ 9.0 - @test y_hat[end, 1] ≈ 4.0 - @test y_hat[1, end] ≈ 3.0 - @test y_hat[1, end-1] ≈ 6.0 - @test y_hat[end, end] ≈ 2.0 + r = ones(Float32, 28, 28, 1, 1) + m = Conv((3, 3), 1 => 1, relu; pad = (0, 1, 1, 2)) + m.weight[:] .= 1.0 + m.bias[:] .= 0.0 + y_hat = m(r)[:, :, 1, 1] + @test size(y_hat) == (27, 29) + @test y_hat[1, 1] ≈ 6.0 + @test y_hat[2, 2] ≈ 9.0 + @test y_hat[end, 1] ≈ 4.0 + @test y_hat[1, end] ≈ 3.0 + @test y_hat[1, end - 1] ≈ 6.0 + @test y_hat[end, end] ≈ 2.0 end @testset "Depthwise Conv" begin - r = zeros(Float32, 28, 28, 3, 5) - m1 = DepthwiseConv((2, 2), 3=>15) - @test size(m1(r), 3) == 15 + r = zeros(Float32, 28, 28, 3, 5) + m1 = DepthwiseConv((2, 2), 3 => 15) + @test size(m1(r), 3) == 15 - m2 = DepthwiseConv((2, 3), 3=>9) - @test size(m2(r), 3) == 9 + m2 = DepthwiseConv((2, 3), 3 => 9) + @test size(m2(r), 3) == 9 - m3 = DepthwiseConv((2, 3), 3=>9; bias=false) - @test size(m2(r), 3) == 9 + m3 = DepthwiseConv((2, 3), 3 => 9; bias = false) + @test size(m2(r), 3) == 9 - # Test that we cannot ask for non-integer multiplication factors - @test_throws AssertionError DepthwiseConv((2,2), 3=>10) + # Test that we cannot ask for non-integer multiplication factors + @test_throws AssertionError DepthwiseConv((2, 2), 3 => 10) end @testset "ConvTranspose" begin - x = zeros(Float32, 5, 5, 1, 1) - y = Conv((3,3), 1 => 1)(x) - x_hat1 = ConvTranspose((3, 3), 1 => 1)(y) - x_hat2 = ConvTranspose((3, 3), 1 => 1, bias=false)(y) - @test size(x_hat1) == size(x_hat2) == size(x) - - m = ConvTranspose((3,3), 1=>1) - # Test that the gradient call does not throw: #900 - @test gradient(()->sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads - - x = zeros(Float32, 5, 5, 2, 4) - m = ConvTranspose((3,3), 2=>3) - @test gradient(()->sum(m(x)), params(m)) isa Flux.Zygote.Grads - - # test ConvTranspose supports groups argument - x = randn(Float32, 10, 10, 2, 3) - m1 = ConvTranspose((3,3), 2=>4, pad=SamePad()) - @test size(m1.weight) == (3,3,4,2) - @test size(m1(x)) == (10,10,4,3) - m2 = ConvTranspose((3,3), 2=>4, groups=2, pad=SamePad()) - @test size(m2.weight) == (3,3,2,2) - @test size(m1(x)) == size(m2(x)) - @test gradient(()->sum(m2(x)), params(m2)) isa Flux.Zygote.Grads - - x = randn(Float32, 10, 2,1) - m = ConvTranspose((3,), 2=>4, pad=SamePad(), groups=2) - @test size(m(x)) === (10,4,1) - @test length(m.weight) == (3)*(2*4) / 2 - - x = randn(Float32, 10, 11, 4,2) - m = ConvTranspose((3,5), 4=>4, pad=SamePad(), groups=4) - @test size(m(x)) === (10,11, 4,2) - @test length(m.weight) == (3*5)*(4*4)/4 - - x = randn(Float32, 10, 11, 12, 3,2) - m = ConvTranspose((3,5,3), 3=>6, pad=SamePad(), groups=3) - @test size(m(x)) === (10,11, 12, 6,2) - @test length(m.weight) == (3*5*3) * (3*6) / 3 - - @test occursin("groups=2", sprint(show, ConvTranspose((3,3), 2=>4, groups=2))) - @test occursin("2 => 4" , sprint(show, ConvTranspose((3,3), 2=>4, groups=2))) + x = zeros(Float32, 5, 5, 1, 1) + y = Conv((3, 3), 1 => 1)(x) + x_hat1 = ConvTranspose((3, 3), 1 => 1)(y) + x_hat2 = ConvTranspose((3, 3), 1 => 1, bias = false)(y) + @test size(x_hat1) == size(x_hat2) == size(x) + + m = ConvTranspose((3, 3), 1 => 1) + # Test that the gradient call does not throw: #900 + @test gradient(() -> sum(m(x)), Flux.params(m)) isa Flux.Zygote.Grads + + x = zeros(Float32, 5, 5, 2, 4) + m = ConvTranspose((3, 3), 2 => 3) + @test gradient(() -> sum(m(x)), params(m)) isa Flux.Zygote.Grads + + # test ConvTranspose supports groups argument + x = randn(Float32, 10, 10, 2, 3) + m1 = ConvTranspose((3, 3), 2 => 4, pad = SamePad()) + @test size(m1.weight) == (3, 3, 4, 2) + @test size(m1(x)) == (10, 10, 4, 3) + m2 = ConvTranspose((3, 3), 2 => 4, groups = 2, pad = SamePad()) + @test size(m2.weight) == (3, 3, 2, 2) + @test size(m1(x)) == size(m2(x)) + @test gradient(() -> sum(m2(x)), params(m2)) isa Flux.Zygote.Grads + + x = randn(Float32, 10, 2, 1) + m = ConvTranspose((3,), 2 => 4, pad = SamePad(), groups = 2) + @test size(m(x)) === (10, 4, 1) + @test length(m.weight) == (3) * (2 * 4) / 2 + + x = randn(Float32, 10, 11, 4, 2) + m = ConvTranspose((3, 5), 4 => 4, pad = SamePad(), groups = 4) + @test size(m(x)) === (10, 11, 4, 2) + @test length(m.weight) == (3 * 5) * (4 * 4) / 4 + + x = randn(Float32, 10, 11, 12, 3, 2) + m = ConvTranspose((3, 5, 3), 3 => 6, pad = SamePad(), groups = 3) + @test size(m(x)) === (10, 11, 12, 6, 2) + @test length(m.weight) == (3 * 5 * 3) * (3 * 6) / 3 + + @test occursin("groups=2", sprint(show, ConvTranspose((3, 3), 2 => 4, groups = 2))) + @test occursin("2 => 4", sprint(show, ConvTranspose((3, 3), 2 => 4, groups = 2))) end @testset "CrossCor" begin - x = rand(Float32, 28, 28, 1, 1) - w = rand(Float32, 2,2,1,1) - y = CrossCor(w, [0.0]) - - @test sum(w .* x[1:2, 1:2, :, :]) ≈ y(x)[1, 1, 1, 1] rtol=2e-7 - - r = zeros(Float32, 28, 28, 1, 5) - m = Chain( - CrossCor((2, 2), 1=>16, relu), - MaxPool((2,2)), - CrossCor((2, 2), 16=>8, relu; bias=false), - MaxPool((2,2)), - x -> reshape(x, :, size(x, 4)), - Dense(288, 10), softmax) - - @test size(m(r)) == (10, 5) - @test y(x) != Conv(w, [0.0])(x) - @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x) ≈ Conv(w, [0.0])(x) rtol=1e-7 + x = rand(Float32, 28, 28, 1, 1) + w = rand(Float32, 2, 2, 1, 1) + y = CrossCor(w, [0.0]) + + @test sum(w .* x[1:2, 1:2, :, :])≈y(x)[1, 1, 1, 1] rtol=2e-7 + + r = zeros(Float32, 28, 28, 1, 5) + m = Chain(CrossCor((2, 2), 1 => 16, relu), + MaxPool((2, 2)), + CrossCor((2, 2), 16 => 8, relu; bias = false), + MaxPool((2, 2)), + x -> reshape(x, :, size(x, 4)), + Dense(288, 10), softmax) + + @test size(m(r)) == (10, 5) + @test y(x) != Conv(w, [0.0])(x) + @test CrossCor(w[end:-1:1, end:-1:1, :, :], [0.0])(x)≈Conv(w, [0.0])(x) rtol=1e-7 end @testset "Conv with non quadratic window #700" begin - data = zeros(Float32, 7,7,1,1) - data[4,4,1,1] = 1 - - l = Conv((3,3), 1=>1) - expected = zeros(eltype(l.weight),5,5,1,1) - expected[2:end-1,2:end-1,1,1] = l.weight - @test expected ≈ l(data) - - l = Conv((3,1), 1=>1) - expected = zeros(eltype(l.weight),5,7,1,1) - expected[2:end-1,4,1,1] = l.weight - @test expected ≈ l(data) - - l = Conv((1,3), 1=>1) - expected = zeros(eltype(l.weight),7,5,1,1) - expected[4,2:end-1,1,1] = l.weight - @test expected ≈ l(data) - - @test begin - # we test that the next expression does not throw - randn(Float32, 10,10,1,1) |> Conv((6,1), 1=>1, Flux.σ) - true - end + data = zeros(Float32, 7, 7, 1, 1) + data[4, 4, 1, 1] = 1 + + l = Conv((3, 3), 1 => 1) + expected = zeros(eltype(l.weight), 5, 5, 1, 1) + expected[2:(end - 1), 2:(end - 1), 1, 1] = l.weight + @test expected ≈ l(data) + + l = Conv((3, 1), 1 => 1) + expected = zeros(eltype(l.weight), 5, 7, 1, 1) + expected[2:(end - 1), 4, 1, 1] = l.weight + @test expected ≈ l(data) + + l = Conv((1, 3), 1 => 1) + expected = zeros(eltype(l.weight), 7, 5, 1, 1) + expected[4, 2:(end - 1), 1, 1] = l.weight + @test expected ≈ l(data) + + @test begin + # we test that the next expression does not throw + randn(Float32, 10, 10, 1, 1) |> Conv((6, 1), 1 => 1, Flux.σ) + true + end end -@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, CrossCor), k in ( (1,), (2,), (3,), (4,5), (6,7,8)) - data = ones(Float32, (k .+ 3)..., 1,1) - l = ltype(k, 1=>1, pad=SamePad()) - @test size(l(data)) == size(data) - - l = ltype(k, 1=>1, pad=SamePad(), dilation = k .÷ 2) - @test size(l(data)) == size(data) - - stride = 3 - l = ltype(k, 1=>1, pad=SamePad(), stride = stride) - if ltype == ConvTranspose - @test size(l(data))[1:end-2] == stride .* size(data)[1:end-2] - else - @test size(l(data))[1:end-2] == cld.(size(data)[1:end-2], stride) - end +@testset "$ltype SamePad kernelsize $k" for ltype in (Conv, ConvTranspose, DepthwiseConv, + CrossCor), + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) + + data = ones(Float32, (k .+ 3)..., 1, 1) + l = ltype(k, 1 => 1, pad = SamePad()) + @test size(l(data)) == size(data) + + l = ltype(k, 1 => 1, pad = SamePad(), dilation = k .÷ 2) + @test size(l(data)) == size(data) + + stride = 3 + l = ltype(k, 1 => 1, pad = SamePad(), stride = stride) + if ltype == ConvTranspose + @test size(l(data))[1:(end - 2)] == stride .* size(data)[1:(end - 2)] + else + @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], stride) + end end -@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), k in ( (1,), (2,), (3,), (4,5), (6,7,8)) - data = ones(Float32, (k .+ 3)..., 1,1) +@testset "$ltype SamePad windowsize $k" for ltype in (MeanPool, MaxPool), + k in ((1,), (2,), (3,), (4, 5), (6, 7, 8)) + + data = ones(Float32, (k .+ 3)..., 1, 1) - l = ltype(k, pad=SamePad()) - @test size(l(data))[1:end-2] == cld.(size(data)[1:end-2], k) + l = ltype(k, pad = SamePad()) + @test size(l(data))[1:(end - 2)] == cld.(size(data)[1:(end - 2)], k) end @testset "bugs fixed" begin - # https://github.com/FluxML/Flux.jl/issues/1421 - @test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} -end +# https://github.com/FluxML/Flux.jl/issues/1421 +@test Conv((5, 5), 10 => 20, identity; init = Base.randn).bias isa Vector{Float64} end @testset "constructors: $fun" for fun in [Conv, CrossCor, ConvTranspose, DepthwiseConv] - @test fun(rand(2,3,4)).bias isa Vector{Float64} - @test fun(rand(2,3,4,5), false).bias === false - if fun == Conv - @test fun(rand(2,3,4,5,6), rand(6)).bias isa Vector{Float64} - @test_skip fun(rand(2,3,4,5,6), 1:6).bias isa Vector{Float64} - elseif fun == DepthwiseConv - @test fun(rand(2,3,4,5,6), rand(30)).bias isa Vector{Float64} - end - @test_throws DimensionMismatch fun(rand(2,3,4), rand(6)) + @test fun(rand(2, 3, 4)).bias isa Vector{Float64} + @test fun(rand(2, 3, 4, 5), false).bias === false + if fun == Conv + @test fun(rand(2, 3, 4, 5, 6), rand(6)).bias isa Vector{Float64} + @test_skip fun(rand(2, 3, 4, 5, 6), 1:6).bias isa Vector{Float64} + elseif fun == DepthwiseConv + @test fun(rand(2, 3, 4, 5, 6), rand(30)).bias isa Vector{Float64} + end + @test_throws DimensionMismatch fun(rand(2, 3, 4), rand(6)) end diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 7ae15aeff9..7a42c5b8e4 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -3,14 +3,13 @@ using Zygote: pullback evalwgrad(f, x...) = pullback(f, x...)[1] -@testset "Dropout" begin - @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1.0+0im,2.0+1im,3.0+3im] +@testset "Dropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0 + 0im, 2.0 + 1im, 3.0 + 3im] @test x == Dropout(0.1; rng_kwargs...)(x) @test x == evalwgrad(Dropout(0; rng_kwargs...), x) @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) - x = [1.,2.,3.] + x = [1.0, 2.0, 3.0] @test x == Dropout(0.1; rng_kwargs...)(x) @test x == evalwgrad(Dropout(0; rng_kwargs...), x) @test zero(x) == evalwgrad(Dropout(1; rng_kwargs...), x) @@ -18,31 +17,31 @@ evalwgrad(f, x...) = pullback(f, x...)[1] x = rand(100) m = Dropout(0.9; rng_kwargs...) y = evalwgrad(m, x) - @test count(a->a==0, y) > 50 + @test count(a -> a == 0, y) > 50 testmode!(m, true) y = evalwgrad(m, x) # should override istraining - @test count(a->a==0, y) == 0 + @test count(a -> a == 0, y) == 0 testmode!(m, false) y = evalwgrad(m, x) - @test count(a->a==0, y) > 50 + @test count(a -> a == 0, y) > 50 x = rand(Float32, 100) - m = Chain(Dense(100,100), + m = Chain(Dense(100, 100), Dropout(0.9; rng_kwargs...)) y = evalwgrad(m, x) - @test count(a->a == 0, y) > 50 + @test count(a -> a == 0, y) > 50 testmode!(m, true) y = evalwgrad(m, x) # should override istraining - @test count(a->a == 0, y) == 0 + @test count(a -> a == 0, y) == 0 x = rand(100, 50) m = Dropout(0.5; dims = 2, rng_kwargs...) y = m(x) - c = map(i->count(a->a==0, @view y[i, :]), 1:100) + c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100) @test minimum(c) == maximum(c) m = Dropout(0.5; dims = 1, rng_kwargs...) y = m(x) - c = map(i->count(a->a==0, @view y[:, i]), 1:50) + c = map(i -> count(a -> a == 0, @view y[:, i]), 1:50) @test minimum(c) == maximum(c) # issue #1084 @@ -51,33 +50,31 @@ evalwgrad(f, x...) = pullback(f, x...)[1] testmode!(m) y = m(x) - @test count(a->a == 0, y) == 0 + @test count(a -> a == 0, y) == 0 trainmode!(m) y = m(x) - @test count(a->a == 0, y) > 50 + @test count(a -> a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) - @test count(a->a == 0, y) > 50 + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active = true) + @test count(a -> a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) - @test count(a->a == 0, y) == 0 + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active = false) + @test count(a -> a == 0, y) == 0 # CPU RNGs map onto CPU ok if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG - else - @test cpu(m).rng isa Random._GLOBAL_RNG - end + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end else - @test cpu(m).rng === only(values(rng_kwargs)) + @test cpu(m).rng === only(values(rng_kwargs)) end - end -end +end end -@testset "AlphaDropout" begin - @testset for rng_kwargs in ((), (; rng = MersenneTwister())) - x = [1., 2., 3.] +@testset "AlphaDropout" begin @testset for rng_kwargs in ((), (; rng = MersenneTwister())) + x = [1.0, 2.0, 3.0] @test x == AlphaDropout(0.1; rng_kwargs...)(x) @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x) @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x) @@ -87,379 +84,392 @@ end y = evalwgrad(m, x) # Should preserve unit mean and variance - @test mean(y) ≈ 0 atol=0.2 - @test var(y) ≈ 1 atol=0.2 + @test mean(y)≈0 atol=0.2 + @test var(y)≈1 atol=0.2 testmode!(m, true) # should override istraining @test evalwgrad(m, x) == x testmode!(m, false) y = evalwgrad(m, x) - @test mean(y) ≈ 0 atol=0.2 - @test var(y) ≈ 1 atol=0.2 + @test mean(y)≈0 atol=0.2 + @test var(y)≈1 atol=0.2 # Known good value ranges # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338 x = ones(100) if isempty(rng_kwargs) - @test 40 < sum(evalwgrad(m, x)) < 130 + @test 40 < sum(evalwgrad(m, x)) < 130 else - # FIXME: this breaks spuriously for MersenneTwister - @test_skip 40 < sum(evalwgrad(m, x)) < 130 + # FIXME: this breaks spuriously for MersenneTwister + @test_skip 40 < sum(evalwgrad(m, x)) < 130 end # CPU RNGs map onto CPU ok if isempty(rng_kwargs) - if VERSION >= v"1.7" - @test cpu(m).rng isa Random.TaskLocalRNG - else - @test cpu(m).rng isa Random._GLOBAL_RNG - end + if VERSION >= v"1.7" + @test cpu(m).rng isa Random.TaskLocalRNG + else + @test cpu(m).rng isa Random._GLOBAL_RNG + end else - @test cpu(m).rng === only(values(rng_kwargs)) + @test cpu(m).rng === only(values(rng_kwargs)) end - end -end +end end @testset "BatchNorm" begin - let m = BatchNorm(2), x = [1.0 3.0 5.0; - 2.0 4.0 6.0] + let m = BatchNorm(2), x = [1.0 3.0 5.0; + 2.0 4.0 6.0] + @test Flux.hasaffine(m) == true + @test length(Flux.params(m)) == 2 + + @test m.β == [0, 0] # initβ(2) + @test m.γ == [1, 1] # initγ(2) + # initial m.σ is 1 + # initial m.μ is 0 + + y = evalwgrad(m, x) + @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5) + # julia> x + # 2×3 Array{Float64,2}: + # 1.0 3.0 5.0 + # 2.0 4.0 6.0 + # + # μ of batch will be + # (1. + 3. + 5.) / 3 = 3 + # (2. + 4. + 6.) / 3 = 4 + # + # ∴ update rule with momentum: + # .1 * 3 + 0 = .3 + # .1 * 4 + 0 = .4 + @test m.μ ≈ reshape([0.3, 0.4], 2, 1) + + # julia> .1 .* var(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] + # 2×1 Array{Float64,2}: + # 1.3 + # 1.3 + @test m.σ² ≈ + 0.1 .* var(x, dims = 2, corrected = false) .* (3 / 2) .+ 0.9 .* [1.0, 1.0] + + x′ = m(x) + @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5) + + @inferred m(x) + end - @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 + let m = BatchNorm(2; track_stats = false), x = [1.0 3.0 5.0; 2.0 4.0 6.0] + @inferred m(x) + end - @test m.β == [0, 0] # initβ(2) - @test m.γ == [1, 1] # initγ(2) - # initial m.σ is 1 - # initial m.μ is 0 + # with activation function + let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0; + 2.0 4.0 6.0] + y = m(x) + @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) + @inferred m(x) + end - y = evalwgrad(m, x) - @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5) - # julia> x - # 2×3 Array{Float64,2}: - # 1.0 3.0 5.0 - # 2.0 4.0 6.0 - # - # μ of batch will be - # (1. + 3. + 5.) / 3 = 3 - # (2. + 4. + 6.) / 3 = 4 - # - # ∴ update rule with momentum: - # .1 * 3 + 0 = .3 - # .1 * 4 + 0 = .4 - @test m.μ ≈ reshape([0.3, 0.4], 2, 1) - - # julia> .1 .* var(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] - # 2×1 Array{Float64,2}: - # 1.3 - # 1.3 - @test m.σ² ≈ .1 .* var(x, dims=2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.] - - x′ = m(x) - @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5) - - @inferred m(x) - end - - let m = BatchNorm(2; track_stats=false), x = [1.0 3.0 5.0; 2.0 4.0 6.0] - @inferred m(x) - end - - # with activation function - let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0; - 2.0 4.0 6.0] - y = m(x) - @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7) - @inferred m(x) - end - - let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1) - y = reshape(permutedims(x, [2, 1, 3]), 2, :) - y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3]) - @test m(x) == y - @inferred m(x) - end - - let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1) - y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) - y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) - @test m(x) == y - @inferred m(x) - end - - let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) - y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :) - y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5]) - @test m(x) == y - @inferred m(x) - end - - let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1); - m(x) - @test (@allocated m(x)) < 100_000_000 - @inferred m(x) - end - - @test length(Flux.params(BatchNorm(10))) == 2 - @test length(Flux.params(BatchNorm(10, affine=true))) == 2 - @test length(Flux.params(BatchNorm(10, affine=false))) == 0 + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1) + y = reshape(permutedims(x, [2, 1, 3]), 2, :) + y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3]) + @test m(x) == y + @inferred m(x) + end + + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1) + y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :) + y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4]) + @test m(x) == y + @inferred m(x) + end + + let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1) + y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :) + y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5]) + @test m(x) == y + @inferred m(x) + end + + let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1) + m(x) + @test (@allocated m(x)) < 100_000_000 + @inferred m(x) + end + + @test length(Flux.params(BatchNorm(10))) == 2 + @test length(Flux.params(BatchNorm(10, affine = true))) == 2 + @test length(Flux.params(BatchNorm(10, affine = false))) == 0 end @testset "InstanceNorm" begin - # begin tests - let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2), + # begin tests + let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (3, 2, 2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 - x = Float32.(x) - @test m.β == [0, 0] # initβ(2) - @test m.γ == [1, 1] # initγ(2) - y = evalwgrad(m, x) - - #julia> x - #[:, :, 1] = - # 1.0 4.0 - # 2.0 5.0 - # 3.0 6.0 - # - #[:, :, 2] = - # 7.0 10.0 - # 8.0 11.0 - # 9.0 12.0 - # - # μ will be - # (1. + 2. + 3.) / 3 = 2. - # (4. + 5. + 6.) / 3 = 5. - # - # (7. + 8. + 9.) / 3 = 8. - # (10. + 11. + 12.) / 3 = 11. - # - # ∴ update rule with momentum: - # (1. - .1) * 0 + .1 * (2. + 8.) / 2 = .5 - # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8 - N = ndims(x) - @test m.μ ≈ [0.5, 0.8] - n = prod(size(x,i) for i in 1:N-2) - corr = n / (n-1) - σ² = var(x, dims=1:N-2, corrected=false) - @test m.σ² ≈ 0.1*corr*vec(mean(σ², dims=N)) .+ 0.9 * 1 - - y = m(x) - @test length(m.μ) == 2 - @test length(m.σ²) == 2 - @test y ≈ (x .- reshape(m.μ, 1,2,1)) ./ sqrt.(reshape(m.σ², 1,2,1) .+ 1f-5) atol=1.0e-5 - - @inferred m(x) - end - - # with activation function - let m = InstanceNorm(2, sigmoid; affine=true, track_stats=true), sizes = (3, 2, 2), - x = reshape(collect(1:prod(sizes)), sizes) - x = Float64.(x) - affine_shape = collect(sizes) - affine_shape[[1,3]] .= 1 + @test length(Flux.params(m)) == 2 + x = Float32.(x) + @test m.β == [0, 0] # initβ(2) + @test m.γ == [1, 1] # initγ(2) + y = evalwgrad(m, x) + + #julia> x + #[:, :, 1] = + # 1.0 4.0 + # 2.0 5.0 + # 3.0 6.0 + # + #[:, :, 2] = + # 7.0 10.0 + # 8.0 11.0 + # 9.0 12.0 + # + # μ will be + # (1. + 2. + 3.) / 3 = 2. + # (4. + 5. + 6.) / 3 = 5. + # + # (7. + 8. + 9.) / 3 = 8. + # (10. + 11. + 12.) / 3 = 11. + # + # ∴ update rule with momentum: + # (1. - .1) * 0 + .1 * (2. + 8.) / 2 = .5 + # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8 + N = ndims(x) + @test m.μ ≈ [0.5, 0.8] + n = prod(size(x, i) for i in 1:(N - 2)) + corr = n / (n - 1) + σ² = var(x, dims = 1:(N - 2), corrected = false) + @test m.σ² ≈ 0.1 * corr * vec(mean(σ², dims = N)) .+ 0.9 * 1 + + y = m(x) + @test length(m.μ) == 2 + @test length(m.σ²) == 2 + @test y≈(x .- reshape(m.μ, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 2, 1) .+ 1.0f-5) atol=1.0e-5 + + @inferred m(x) + end - y = evalwgrad(m, x) - y = m(x) # inference time after a training step - μ = reshape(m.μ, affine_shape...) - σ² = reshape(m.σ², affine_shape...) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + # with activation function + let m = InstanceNorm(2, sigmoid; affine = true, track_stats = true), sizes = (3, 2, 2), + x = reshape(collect(1:prod(sizes)), sizes) - @inferred m(x) - end + x = Float64.(x) + affine_shape = collect(sizes) + affine_shape[[1, 3]] .= 1 - # with activation function - let m = InstanceNorm(2, sigmoid; affine=true, track_stats=false), sizes = (3, 2, 2), - x = reshape(collect(1:prod(sizes)), sizes) + y = evalwgrad(m, x) + y = m(x) # inference time after a training step + μ = reshape(m.μ, affine_shape...) + σ² = reshape(m.σ², affine_shape...) + @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 - @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 - x = Float64.(x) - y = m(x) - μ = mean(x, dims=1) - σ² = var(x, dims=1, corrected=false) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @inferred m(x) + end - @inferred m(x) - end + # with activation function + let m = InstanceNorm(2, sigmoid; affine = true, track_stats = false), sizes = (3, 2, 2), + x = reshape(collect(1:prod(sizes)), sizes) - let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), - x = reshape(collect(1:prod(sizes)), sizes) - @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + @test Flux.hasaffine(m) == true + @test length(Flux.params(m)) == 2 + x = Float64.(x) + y = m(x) + μ = mean(x, dims = 1) + σ² = var(x, dims = 1, corrected = false) + @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 - x = Float64.(x) - y = m(x) - μ = mean(x, dims=1) - σ² = var(x, dims=1, corrected=false) - @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 + @inferred m(x) + end - @inferred m(x) - end + let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2), + x = reshape(collect(1:prod(sizes)), sizes) + @test Flux.hasaffine(m) == false + @test length(Flux.params(m)) == 0 - let m = trainmode!(InstanceNorm(2; affine=true)), sizes = (2, 4, 1, 2, 3), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) - y = reshape(m(y), sizes...) - @test m(x) == y + x = Float64.(x) + y = m(x) + μ = mean(x, dims = 1) + σ² = var(x, dims = 1, corrected = false) + @test y≈sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ)) atol=1.0e-7 - @inferred m(x) - end + @inferred m(x) + end - # check that μ, σ², and the output are the correct size for higher rank tensors - let m = InstanceNorm(2; affine=true,track_stats=true), sizes = (5, 5, 3, 4, 2, 6), - x = reshape(Float32.(collect(1:prod(sizes))), sizes) - y = evalwgrad(m, x) - @test size(m.μ) == (sizes[end - 1], ) - @test size(m.σ²) == (sizes[end - 1], ) - @test size(y) == sizes + let m = trainmode!(InstanceNorm(2; affine = true)), sizes = (2, 4, 1, 2, 3), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - @inferred m(x) - end + y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) + y = reshape(m(y), sizes...) + @test m(x) == y - # show that instance norm is equal to batch norm when channel and batch dims are squashed - let m_inorm = trainmode!(InstanceNorm(2; affine=true)), m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), - x = reshape(Float32.(collect(1:prod(sizes))), sizes) - @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes) - end + @inferred m(x) + end - let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1); - m(x) - @test (@allocated m(x)) < 100_000_000 + # check that μ, σ², and the output are the correct size for higher rank tensors + let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (5, 5, 3, 4, 2, 6), + x = reshape(Float32.(collect(1:prod(sizes))), sizes) - @inferred m(x) - end + y = evalwgrad(m, x) + @test size(m.μ) == (sizes[end - 1],) + @test size(m.σ²) == (sizes[end - 1],) + @test size(y) == sizes - @test length(Flux.params(InstanceNorm(10))) == 0 - @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 - @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 + @inferred m(x) + end + + # show that instance norm is equal to batch norm when channel and batch dims are squashed + let m_inorm = trainmode!(InstanceNorm(2; affine = true)), + m_bnorm = trainmode!(BatchNorm(12)), sizes = (5, 5, 3, 4, 2, 6), + x = reshape(Float32.(collect(1:prod(sizes))), sizes) + + @test m_inorm(x) == + reshape(m_bnorm(reshape(x, (sizes[1:(end - 2)]..., :, 1))), sizes) + end + + let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1) + m(x) + @test (@allocated m(x)) < 100_000_000 + + @inferred m(x) + end + + @test length(Flux.params(InstanceNorm(10))) == 0 + @test length(Flux.params(InstanceNorm(10, affine = true))) == 2 + @test length(Flux.params(InstanceNorm(10, affine = false))) == 0 end @testset "LayerNorm" begin - x = rand(2,3) - @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1) - x = rand(2,3,4) - @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1) - x = rand(2,3,4,5) - @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1) - x = rand(2) - @test LayerNorm(2, tanh)(x) ≈ tanh.(Flux.normalise(x, dims=1)) - - x = rand(2,3,4,5) - @test LayerNorm((2,3))(x) ≈ Flux.normalise(x, dims=(1,2)) - x = rand(2,3,4,5) - @test LayerNorm((2,3,4))(x) ≈ Flux.normalise(x, dims=1:3) - - m = LayerNorm((2,3,4)) - @test Flux.hasaffine(m) == true - @test length(Flux.params(m)) == 2 - m = LayerNorm((2,3,4), affine=false) - @test Flux.hasaffine(m) == false - @test length(Flux.params(m)) == 0 + x = rand(2, 3) + @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1) + x = rand(2, 3, 4) + @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1) + x = rand(2, 3, 4, 5) + @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1) + x = rand(2) + @test LayerNorm(2, tanh)(x) ≈ tanh.(Flux.normalise(x, dims = 1)) + + x = rand(2, 3, 4, 5) + @test LayerNorm((2, 3))(x) ≈ Flux.normalise(x, dims = (1, 2)) + x = rand(2, 3, 4, 5) + @test LayerNorm((2, 3, 4))(x) ≈ Flux.normalise(x, dims = 1:3) + + m = LayerNorm((2, 3, 4)) + @test Flux.hasaffine(m) == true + @test length(Flux.params(m)) == 2 + m = LayerNorm((2, 3, 4), affine = false) + @test Flux.hasaffine(m) == false + @test length(Flux.params(m)) == 0 end @testset "GroupNorm" begin - # begin tests - squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions + # begin tests + squeeze(x) = dropdims(x, dims = tuple(findall(size(x) .== 1)...)) # To remove all singular dimensions - let m = GroupNorm(4,2, track_stats=true), sizes = (3,4,2), + let m = GroupNorm(4, 2, track_stats = true), sizes = (3, 4, 2), x = reshape(collect(1:prod(sizes)), sizes) - @test length(Flux.params(m)) == 2 - x = Float32.(x) - @test m.β == [0, 0, 0, 0] # initβ(32) - @test m.γ == [1, 1, 1, 1] # initγ(32) - - y = evalwgrad(m, x) - - #julia> x - #[:, :, 1] = - # 1.0 4.0 7.0 10.0 - # 2.0 5.0 8.0 11.0 - # 3.0 6.0 9.0 12.0 - # - #[:, :, 2] = - # 13.0 16.0 19.0 22.0 - # 14.0 17.0 20.0 23.0 - # 15.0 18.0 21.0 24.0 - # - # μ will be - # (1. + 2. + 3. + 4. + 5. + 6.) / 6 = 3.5 - # (7. + 8. + 9. + 10. + 11. + 12.) / 6 = 9.5 - # - # (13. + 14. + 15. + 16. + 17. + 18.) / 6 = 15.5 - # (19. + 20. + 21. + 22. + 23. + 24.) / 6 = 21.5 - # - # μ = - # 3.5 15.5 - # 9.5 21.5 - # - # ∴ update rule with momentum: - # (1. - .1) * 0 + .1 * (3.5 + 15.5) / 2 = 0.95 - # (1. - .1) * 0 + .1 * (9.5 + 21.5) / 2 = 1.55 - @test m.μ ≈ [0.95, 1.55] - n = prod(size(x)) ÷ m.G ÷ size(x)[end] - corr = n / (n-1) - z = reshape(x,3,2,2,2) - σ² = var(z, dims=(1,2), corrected=false) - @test m.σ² ≈ 0.1*corr*vec(mean(σ², dims=4)) .+ 0.9 * 1 - - y = m(x) - out = (z .- reshape(m.μ, 1,1,2,1)) ./ sqrt.(reshape(m.σ², 1,1,2,1) .+ 1f-5) - @test y ≈ reshape(out, size(x)) atol=1.0e-5 - end - # with activation function - let m = GroupNorm(4,2, sigmoid, track_stats=true), sizes = (3, 4, 2), - x = reshape(collect(1:prod(sizes)), sizes) - x = Float32.(x) - μ_affine_shape = ones(Int,length(sizes) + 1) - μ_affine_shape[end-1] = 2 # Number of groups - - affine_shape = ones(Int,length(sizes) + 1) - affine_shape[end-2] = 2 # Channels per group - affine_shape[end-1] = 2 # Number of groups - affine_shape[1] = sizes[1] - affine_shape[end] = sizes[end] - - og_shape = size(x) + @test length(Flux.params(m)) == 2 + x = Float32.(x) + @test m.β == [0, 0, 0, 0] # initβ(32) + @test m.γ == [1, 1, 1, 1] # initγ(32) + + y = evalwgrad(m, x) + + #julia> x + #[:, :, 1] = + # 1.0 4.0 7.0 10.0 + # 2.0 5.0 8.0 11.0 + # 3.0 6.0 9.0 12.0 + # + #[:, :, 2] = + # 13.0 16.0 19.0 22.0 + # 14.0 17.0 20.0 23.0 + # 15.0 18.0 21.0 24.0 + # + # μ will be + # (1. + 2. + 3. + 4. + 5. + 6.) / 6 = 3.5 + # (7. + 8. + 9. + 10. + 11. + 12.) / 6 = 9.5 + # + # (13. + 14. + 15. + 16. + 17. + 18.) / 6 = 15.5 + # (19. + 20. + 21. + 22. + 23. + 24.) / 6 = 21.5 + # + # μ = + # 3.5 15.5 + # 9.5 21.5 + # + # ∴ update rule with momentum: + # (1. - .1) * 0 + .1 * (3.5 + 15.5) / 2 = 0.95 + # (1. - .1) * 0 + .1 * (9.5 + 21.5) / 2 = 1.55 + @test m.μ ≈ [0.95, 1.55] + n = prod(size(x)) ÷ m.G ÷ size(x)[end] + corr = n / (n - 1) + z = reshape(x, 3, 2, 2, 2) + σ² = var(z, dims = (1, 2), corrected = false) + @test m.σ² ≈ 0.1 * corr * vec(mean(σ², dims = 4)) .+ 0.9 * 1 + + y = m(x) + out = (z .- reshape(m.μ, 1, 1, 2, 1)) ./ sqrt.(reshape(m.σ², 1, 1, 2, 1) .+ 1.0f-5) + @test y≈reshape(out, size(x)) atol=1.0e-5 + end + # with activation function + let m = GroupNorm(4, 2, sigmoid, track_stats = true), sizes = (3, 4, 2), + x = reshape(collect(1:prod(sizes)), sizes) - y = m(x) - x_ = reshape(x,affine_shape...) - out = reshape(sigmoid.((x_ .- reshape(m.μ,μ_affine_shape...)) ./ sqrt.(reshape(m.σ²,μ_affine_shape...) .+ m.ϵ)),og_shape) - @test y ≈ out atol=1e-7 - end - - let m = trainmode!(GroupNorm(2,2, track_stats=true)), sizes = (2, 4, 1, 2, 3), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) - y = reshape(m(y), sizes...) - @test m(x) == y - end - - # check that μ, σ², and the output are the correct size for higher rank tensors - let m = GroupNorm(4,2, track_stats=true), sizes = (5, 5, 3, 4, 4, 6), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - y = evalwgrad(m, x) - @test size(m.μ) == (m.G,) - @test size(m.σ²) == (m.G,) - @test size(y) == sizes - end - - # show that group norm is the same as instance norm when the group size is the same as the number of channels - let IN = trainmode!(InstanceNorm(4; affine=true)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,5), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - @test IN(x) ≈ GN(x) - end - - # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 - let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4,4)), sizes = (2,2,3,4,1), - x = Float32.(reshape(collect(1:prod(sizes)), sizes)) - @test BN(x) ≈ GN(x) - end + x = Float32.(x) + μ_affine_shape = ones(Int, length(sizes) + 1) + μ_affine_shape[end - 1] = 2 # Number of groups + + affine_shape = ones(Int, length(sizes) + 1) + affine_shape[end - 2] = 2 # Channels per group + affine_shape[end - 1] = 2 # Number of groups + affine_shape[1] = sizes[1] + affine_shape[end] = sizes[end] + + og_shape = size(x) + + y = m(x) + x_ = reshape(x, affine_shape...) + out = reshape(sigmoid.((x_ .- reshape(m.μ, μ_affine_shape...)) ./ + sqrt.(reshape(m.σ², μ_affine_shape...) .+ m.ϵ)), og_shape) + @test y≈out atol=1e-7 + end + + let m = trainmode!(GroupNorm(2, 2, track_stats = true)), sizes = (2, 4, 1, 2, 3), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) + + y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3) + y = reshape(m(y), sizes...) + @test m(x) == y + end + + # check that μ, σ², and the output are the correct size for higher rank tensors + let m = GroupNorm(4, 2, track_stats = true), sizes = (5, 5, 3, 4, 4, 6), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) + + y = evalwgrad(m, x) + @test size(m.μ) == (m.G,) + @test size(m.σ²) == (m.G,) + @test size(y) == sizes + end + + # show that group norm is the same as instance norm when the group size is the same as the number of channels + let IN = trainmode!(InstanceNorm(4; affine = true)), GN = trainmode!(GroupNorm(4, 4)), + sizes = (2, 2, 3, 4, 5), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) + + @test IN(x) ≈ GN(x) + end + + # show that group norm is the same as batch norm for a group of size 1 and batch of size 1 + let BN = trainmode!(BatchNorm(4)), GN = trainmode!(GroupNorm(4, 4)), + sizes = (2, 2, 3, 4, 1), + x = Float32.(reshape(collect(1:prod(sizes)), sizes)) + + @test BN(x) ≈ GN(x) + end end @testset "second derivatives" begin - m1 = Dropout(0.5) - @test Zygote.hessian_reverse(sum∘m1, [1.0,2.0,3.0]) == zeros(3, 3) + m1 = Dropout(0.5) + @test Zygote.hessian_reverse(sum ∘ m1, [1.0, 2.0, 3.0]) == zeros(3, 3) end diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl index facab8466b..e7a2b3d14f 100644 --- a/test/layers/recurrent.jl +++ b/test/layers/recurrent.jl @@ -2,76 +2,79 @@ using LinearAlgebra # Ref FluxML/Flux.jl#1209 1D input @testset "BPTT-1D" begin - seq = [rand(Float32, 2) for i = 1:3] - for r ∈ [RNN,] - rnn = r(2 => 3) - Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do - sum([rnn(s) for s in seq][3]) + seq = [rand(Float32, 2) for i in 1:3] + for r in [RNN] + rnn = r(2 => 3) + Flux.reset!(rnn) + grads_seq = gradient(Flux.params(rnn)) do + return sum([rnn(s) for s in seq][3]) + end + Flux.reset!(rnn) + bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + + Wh * + tanh.(rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + + Wh * rnn.cell.state0 + + rnn.cell.b) + + rnn.cell.b) + + rnn.cell.b)), + rnn.cell.Wh) + @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end - Flux.reset!(rnn); - bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh * - tanh.(rnn.cell.Wi * seq[2] + Wh * - tanh.(rnn.cell.Wi * seq[1] + - Wh * rnn.cell.state0 - + rnn.cell.b) - + rnn.cell.b) - + rnn.cell.b)), - rnn.cell.Wh) - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] - end end # Ref FluxML/Flux.jl#1209 2D input @testset "BPTT-2D" begin - seq = [rand(Float32, (2, 1)) for i = 1:3] - for r ∈ [RNN,] - rnn = r(2 => 3) - Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do - sum([rnn(s) for s in seq][3]) + seq = [rand(Float32, (2, 1)) for i in 1:3] + for r in [RNN] + rnn = r(2 => 3) + Flux.reset!(rnn) + grads_seq = gradient(Flux.params(rnn)) do + return sum([rnn(s) for s in seq][3]) + end + Flux.reset!(rnn) + bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + + Wh * + tanh.(rnn.cell.Wi * seq[2] + + Wh * + tanh.(rnn.cell.Wi * seq[1] + + Wh * rnn.cell.state0 + + rnn.cell.b) + + rnn.cell.b) + + rnn.cell.b)), + rnn.cell.Wh) + @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end - Flux.reset!(rnn); - bptt = gradient(Wh -> sum(tanh.(rnn.cell.Wi * seq[3] + Wh * - tanh.(rnn.cell.Wi * seq[2] + Wh * - tanh.(rnn.cell.Wi * seq[1] + - Wh * rnn.cell.state0 - + rnn.cell.b) - + rnn.cell.b) - + rnn.cell.b)), - rnn.cell.Wh) - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] - end end @testset "BPTT-3D" begin - seq = rand(Float32, (2, 1, 3)) - rnn = RNN(2 => 3) - Flux.reset!(rnn) - grads_seq = gradient(Flux.params(rnn)) do - sum(rnn(seq)[:, :, 3]) - end - Flux.reset!(rnn); - bptt = gradient(rnn.cell.Wh) do Wh - # calculate state 1 - s1 = tanh.(rnn.cell.Wi * seq[:, :, 1] + - Wh * rnn.cell.state0 + - rnn.cell.b) - #calculate state 2 - s2 = tanh.(rnn.cell.Wi * seq[:, :, 2] + - Wh * s1 + - rnn.cell.b) - #calculate state 3 - s3 = tanh.(rnn.cell.Wi * seq[:, :, 3] + - Wh * s2 + - rnn.cell.b) - sum(s3) # loss is sum of state 3 - end - @test grads_seq[rnn.cell.Wh] ≈ bptt[1] + seq = rand(Float32, (2, 1, 3)) + rnn = RNN(2 => 3) + Flux.reset!(rnn) + grads_seq = gradient(Flux.params(rnn)) do + return sum(rnn(seq)[:, :, 3]) + end + Flux.reset!(rnn) + bptt = gradient(rnn.cell.Wh) do Wh + # calculate state 1 + s1 = tanh.(rnn.cell.Wi * seq[:, :, 1] + + Wh * rnn.cell.state0 + + rnn.cell.b) + #calculate state 2 + s2 = tanh.(rnn.cell.Wi * seq[:, :, 2] + + Wh * s1 + + rnn.cell.b) + #calculate state 3 + s3 = tanh.(rnn.cell.Wi * seq[:, :, 3] + + Wh * s2 + + rnn.cell.b) + return sum(s3) # loss is sum of state 3 + end + @test grads_seq[rnn.cell.Wh] ≈ bptt[1] end -@testset "RNN-shapes" begin - @testset for R in [RNN, GRU, LSTM, GRUv3] +@testset "RNN-shapes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] m1 = R(3 => 5) m2 = R(3 => 5) m3 = R(3, 5) # leave one to test the silently deprecated "," not "=>" notation @@ -87,85 +90,85 @@ end @test size(m2(x2)) == (5, 1) @test size(m3(x3)) == (5, 1, 2) @test size(m3(x3)) == (5, 1, 2) - end -end +end end -@testset "RNN-input-state-eltypes" begin - @testset for R in [RNN, GRU, LSTM, GRUv3] +@testset "RNN-input-state-eltypes" begin @testset for R in [RNN, GRU, LSTM, GRUv3] m = R(3 => 5) x = rand(Float64, 3, 1) Flux.reset!(m) @test_throws MethodError m(x) - end -end +end end @testset "multigate" begin - x = rand(6, 5) - res, (dx,) = Flux.withgradient(x) do x - x1, _, x3 = Flux.multigate(x, 2, Val(3)) - sum(x1) + sum(x3 .* 2) - end - @test res == sum(x[1:2, :]) + 2sum(x[5:6, :]) - @test dx == [ones(2, 5); zeros(2, 5); fill(2, 2, 5)] + x = rand(6, 5) + res, (dx,) = Flux.withgradient(x) do x + x1, _, x3 = Flux.multigate(x, 2, Val(3)) + return sum(x1) + sum(x3 .* 2) + end + @test res == sum(x[1:2, :]) + 2sum(x[5:6, :]) + @test dx == [ones(2, 5); zeros(2, 5); fill(2, 2, 5)] end @testset "eachlastdim" begin - x = rand(3, 3, 1, 2, 4) - @test length(Flux.eachlastdim(x)) == size(x, ndims(x)) - @test collect(@inferred(Flux.eachlastdim(x))) == collect(eachslice(x; dims=ndims(x))) - slicedim = (size(x)[1:end-1]..., 1) - res, (dx,) = Flux.withgradient(x) do x - x1, _, x3, _ = Flux.eachlastdim(x) - sum(x1) + sum(x3 .* 3) - end - @test res ≈ sum(selectdim(x, ndims(x), 1)) + 3sum(selectdim(x, ndims(x), 3)) - @test dx ≈ cat(fill(1, slicedim), fill(0, slicedim), - fill(3, slicedim), fill(0, slicedim); dims=ndims(x)) + x = rand(3, 3, 1, 2, 4) + @test length(Flux.eachlastdim(x)) == size(x, ndims(x)) + @test collect(@inferred(Flux.eachlastdim(x))) == collect(eachslice(x; dims = ndims(x))) + slicedim = (size(x)[1:(end - 1)]..., 1) + res, (dx,) = Flux.withgradient(x) do x + x1, _, x3, _ = Flux.eachlastdim(x) + return sum(x1) + sum(x3 .* 3) + end + @test res ≈ sum(selectdim(x, ndims(x), 1)) + 3sum(selectdim(x, ndims(x), 3)) + @test dx ≈ cat(fill(1, slicedim), fill(0, slicedim), + fill(3, slicedim), fill(0, slicedim); dims = ndims(x)) end @testset "∇eachlastdim" begin x = rand(3, 3, 1, 2, 4) x_size = size(x) - y = collect(eachslice(x; dims=ndims(x))) + y = collect(eachslice(x; dims = ndims(x))) @test @inferred(Flux.∇eachlastdim(y, x)) == x ZeroTangent = Flux.Zygote.ZeroTangent NoTangent = Flux.Zygote.NoTangent abstract_zeros_vector = [ZeroTangent(), ZeroTangent(), NoTangent(), NoTangent()] @test @inferred(Flux.∇eachlastdim(abstract_zeros_vector, x)) == zeros(size(x)) - x2 = rand(Float64, x_size[1:end-1]) - x3 = rand(Float64, x_size[1:end-1]) + x2 = rand(Float64, x_size[1:(end - 1)]) + x3 = rand(Float64, x_size[1:(end - 1)]) mixed_vector = [ZeroTangent(), x2, x3, ZeroTangent()] - @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ cat(zeros(x_size[1:end-1]), - x2, - x3, - zeros(x_size[1:end-1]); dims=ndims(x)) + @test @inferred(Flux.∇eachlastdim(mixed_vector, x)) ≈ cat(zeros(x_size[1:(end - 1)]), + x2, + x3, + zeros(x_size[1:(end - 1)]); dims = ndims(x)) end @testset "Different Internal Matrix Types" begin - R = Flux.Recur(Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), rand(5, 1))) - # don't want to pull in SparseArrays just for this test, but there aren't any - # non-square structured matrix types in LinearAlgebra. so we will use a different - # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the - # same type. - L = Flux.Recur(Flux.LSTMCell(rand(5*4, 3), rand(1:20, 5*4, 5), rand(5*4), (rand(5, 1), rand(5, 1)))) - G = Flux.Recur(Flux.GRUCell(rand(5*3, 3), rand(1:20, 5*3, 5), rand(5*3), rand(5, 1))) - G3 = Flux.Recur(Flux.GRUv3Cell(rand(5*3, 3), rand(1:20, 5*2, 5), rand(5*3), Tridiagonal(rand(5, 5)), rand(5, 1))) - - for m in [R, L, G, G3] + R = Flux.Recur(Flux.RNNCell(tanh, rand(5, 3), Tridiagonal(rand(5, 5)), rand(5), + rand(5, 1))) + # don't want to pull in SparseArrays just for this test, but there aren't any + # non-square structured matrix types in LinearAlgebra. so we will use a different + # eltype matrix, which would fail before when `W_i` and `W_h` were required to be the + # same type. + L = Flux.Recur(Flux.LSTMCell(rand(5 * 4, 3), rand(1:20, 5 * 4, 5), rand(5 * 4), + (rand(5, 1), rand(5, 1)))) + G = Flux.Recur(Flux.GRUCell(rand(5 * 3, 3), rand(1:20, 5 * 3, 5), rand(5 * 3), + rand(5, 1))) + G3 = Flux.Recur(Flux.GRUv3Cell(rand(5 * 3, 3), rand(1:20, 5 * 2, 5), rand(5 * 3), + Tridiagonal(rand(5, 5)), rand(5, 1))) - x1 = rand(3) - x2 = rand(3, 1) - x3 = rand(3, 1, 2) - Flux.reset!(m) - @test size(m(x1)) == (5,) - Flux.reset!(m) - @test size(m(x1)) == (5,) # repeat in case of effect from change in state shape - @test size(m(x2)) == (5, 1) - Flux.reset!(m) - @test size(m(x2)) == (5, 1) - Flux.reset!(m) - @test size(m(x3)) == (5, 1, 2) - Flux.reset!(m) - @test size(m(x3)) == (5, 1, 2) - end + for m in [R, L, G, G3] + x1 = rand(3) + x2 = rand(3, 1) + x3 = rand(3, 1, 2) + Flux.reset!(m) + @test size(m(x1)) == (5,) + Flux.reset!(m) + @test size(m(x1)) == (5,) # repeat in case of effect from change in state shape + @test size(m(x2)) == (5, 1) + Flux.reset!(m) + @test size(m(x2)) == (5, 1) + Flux.reset!(m) + @test size(m(x3)) == (5, 1, 2) + Flux.reset!(m) + @test size(m(x3)) == (5, 1, 2) + end end diff --git a/test/layers/show.jl b/test/layers/show.jl index 3fc9bd097b..08d1c845f7 100644 --- a/test/layers/show.jl +++ b/test/layers/show.jl @@ -1,75 +1,73 @@ @testset "layer printing" begin # 2-arg show, defined with layes - - @test repr(Dense(2,3)) == "Dense(2 => 3)" - @test repr(Chain(Dense(2,3))) == "Chain(Dense(2 => 3))" - @test repr(Chain(lay=Dense(2,3))) == "Chain(lay = Dense(2 => 3))" - + @test repr(Dense(2, 3)) == "Dense(2 => 3)" + @test repr(Chain(Dense(2, 3))) == "Chain(Dense(2 => 3))" + @test repr(Chain(lay = Dense(2, 3))) == "Chain(lay = Dense(2 => 3))" end @testset "nested model printing" begin # 3-arg show, defined in show.jl - # Dense -- has parameter count, but not when inside a matrix: + # Dense -- has parameter count, but not when inside a matrix: - toplevel_dense = repr("text/plain", Dense(2,3)) - @test occursin("Dense(2 => 3)", toplevel_dense) - @test occursin("# 9 parameters", toplevel_dense) + toplevel_dense = repr("text/plain", Dense(2, 3)) + @test occursin("Dense(2 => 3)", toplevel_dense) + @test occursin("# 9 parameters", toplevel_dense) - @test Meta.isexpr(Meta.parse(toplevel_dense), :call) # comment is ignored + @test Meta.isexpr(Meta.parse(toplevel_dense), :call) # comment is ignored - vector_dense = repr("text/plain", [Dense(2,3), Dense(2,3)]) - @test occursin("Dense(2 => 3)", vector_dense) - @test occursin("# 9 parameters", vector_dense) + vector_dense = repr("text/plain", [Dense(2, 3), Dense(2, 3)]) + @test occursin("Dense(2 => 3)", vector_dense) + @test occursin("# 9 parameters", vector_dense) - matrix_dense = repr("text/plain", fill(Dense(2,3), 3, 3)) - @test occursin("Dense(2 => 3)", matrix_dense) - @test !occursin("# 9 parameters", matrix_dense) + matrix_dense = repr("text/plain", fill(Dense(2, 3), 3, 3)) + @test occursin("Dense(2 => 3)", matrix_dense) + @test !occursin("# 9 parameters", matrix_dense) - tuple_dense = repr("text/plain", tuple(Dense(2,3))) - @test occursin("Dense(2 => 3)", tuple_dense) - @test !occursin("# 9 parameters", tuple_dense) + tuple_dense = repr("text/plain", tuple(Dense(2, 3))) + @test occursin("Dense(2 => 3)", tuple_dense) + @test !occursin("# 9 parameters", tuple_dense) - # Chain -- gets split over lines at top level only + # Chain -- gets split over lines at top level only - toplevel_chain = repr("text/plain", Chain(Dense(2,3))) - @test occursin("Chain(\n Dense(2 => 3)", toplevel_chain) - @test occursin("# 9 parameters", toplevel_chain) - @test !occursin("# Total:", toplevel_chain) + toplevel_chain = repr("text/plain", Chain(Dense(2, 3))) + @test occursin("Chain(\n Dense(2 => 3)", toplevel_chain) + @test occursin("# 9 parameters", toplevel_chain) + @test !occursin("# Total:", toplevel_chain) - vector_chain = repr("text/plain", [Chain(Dense(2,3)), Chain(Dense(2,3))]) - @test occursin("Chain(Dense(2 => 3))", vector_chain) - @test occursin("# 9 parameters", vector_chain) - @test !occursin("# Total:", vector_chain) + vector_chain = repr("text/plain", [Chain(Dense(2, 3)), Chain(Dense(2, 3))]) + @test occursin("Chain(Dense(2 => 3))", vector_chain) + @test occursin("# 9 parameters", vector_chain) + @test !occursin("# Total:", vector_chain) - matrix_chain = repr("text/plain", fill(Chain(Dense(2,3)), 3,3)) - @test occursin("Chain(Dense(2 => 3))", matrix_chain) - @test !occursin("# 9 parameters", matrix_chain) - @test !occursin("# Total:", matrix_chain) + matrix_chain = repr("text/plain", fill(Chain(Dense(2, 3)), 3, 3)) + @test occursin("Chain(Dense(2 => 3))", matrix_chain) + @test !occursin("# 9 parameters", matrix_chain) + @test !occursin("# Total:", matrix_chain) - # ... and only long enough chains get a total at the end: + # ... and only long enough chains get a total at the end: - longchain = Chain(Dense(2 => 3), Dense(3 => 4), Dense(4 => 5), softmax) + longchain = Chain(Dense(2 => 3), Dense(3 => 4), Dense(4 => 5), softmax) - toplevel_longchain = repr("text/plain", longchain) - @test occursin("Chain(\n Dense(2 => 3)", toplevel_longchain) - @test occursin("# 9 parameters", toplevel_longchain) - @test occursin("# Total: 6 arrays, 50 parameters", toplevel_longchain) + toplevel_longchain = repr("text/plain", longchain) + @test occursin("Chain(\n Dense(2 => 3)", toplevel_longchain) + @test occursin("# 9 parameters", toplevel_longchain) + @test occursin("# Total: 6 arrays, 50 parameters", toplevel_longchain) - vector_longchain = repr("text/plain", [longchain, longchain]) # pretty ugly in reality - @test occursin("Chain(Dense(2 => 3)", vector_longchain) - @test occursin("# 50 parameters", vector_longchain) - @test !occursin("# 9 parameters", vector_longchain) - @test !occursin("# Total:", vector_longchain) + vector_longchain = repr("text/plain", [longchain, longchain]) # pretty ugly in reality + @test occursin("Chain(Dense(2 => 3)", vector_longchain) + @test occursin("# 50 parameters", vector_longchain) + @test !occursin("# 9 parameters", vector_longchain) + @test !occursin("# Total:", vector_longchain) - matrix_longchain = repr("text/plain", fill(longchain, 3,3)) - @test occursin("Chain(Dense(2 => 3)", matrix_longchain) - @test !occursin("# 9 parameters", matrix_longchain) - @test !occursin("# Total:", matrix_longchain) + matrix_longchain = repr("text/plain", fill(longchain, 3, 3)) + @test occursin("Chain(Dense(2 => 3)", matrix_longchain) + @test !occursin("# 9 parameters", matrix_longchain) + @test !occursin("# Total:", matrix_longchain) - @test Meta.isexpr(Meta.parse(toplevel_longchain), :call) # comments are ignored - @test Meta.parse(toplevel_longchain).args[1] == :Chain + @test Meta.isexpr(Meta.parse(toplevel_longchain), :call) # comments are ignored + @test Meta.parse(toplevel_longchain).args[1] == :Chain - # Functors@0.3 marks transposed matrices non-leaf, shouldn't affect printing: - adjoint_chain = repr("text/plain", Chain([Dense([1 2; 3 4]')])) - @test occursin("Dense(2 => 2)", adjoint_chain) - @test occursin("Chain([", adjoint_chain) + # Functors@0.3 marks transposed matrices non-leaf, shouldn't affect printing: + adjoint_chain = repr("text/plain", Chain([Dense([1 2; 3 4]')])) + @test occursin("Dense(2 => 2)", adjoint_chain) + @test occursin("Chain([", adjoint_chain) end diff --git a/test/layers/stateless.jl b/test/layers/stateless.jl index 23caf06e3b..18de58ce7a 100644 --- a/test/layers/stateless.jl +++ b/test/layers/stateless.jl @@ -2,13 +2,13 @@ using Test using Flux: flatten @testset "helpers" begin - @testset "flatten" begin - x = randn(Float32, 10, 10, 3, 2) - @test size(flatten(x)) == (300, 2) - end + @testset "flatten" begin + x = randn(Float32, 10, 10, 3, 2) + @test size(flatten(x)) == (300, 2) + end - @testset "normalise" begin - x = randn(Float32, 3, 2, 2) - @test Flux.normalise(x) == Flux.normalise(x; dims=3) - end + @testset "normalise" begin + x = randn(Float32, 3, 2, 2) + @test Flux.normalise(x) == Flux.normalise(x; dims = 3) + end end diff --git a/test/layers/upsample.jl b/test/layers/upsample.jl index bc752a55a0..f6b80dcd20 100644 --- a/test/layers/upsample.jl +++ b/test/layers/upsample.jl @@ -1,87 +1,87 @@ @testset "upsample bilinear" begin - m = Upsample(:bilinear, scale=(2, 3)) - x = rand(Float32, 3, 4, 2, 3) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 12, 2, 3) + m = Upsample(:bilinear, scale = (2, 3)) + x = rand(Float32, 3, 4, 2, 3) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 12, 2, 3) - m = Upsample(:bilinear, scale=3) - x = rand(Float32, 3, 4, 2, 3) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (9, 12, 2, 3) + m = Upsample(:bilinear, scale = 3) + x = rand(Float32, 3, 4, 2, 3) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (9, 12, 2, 3) - m = Upsample(:bilinear, size=(4, 6)) - x = rand(Float32, 3, 4, 2, 3) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (4, 6, 2, 3) + m = Upsample(:bilinear, size = (4, 6)) + x = rand(Float32, 3, 4, 2, 3) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (4, 6, 2, 3) end @testset "upsample trilinear" begin - m = Upsample(:trilinear, scale=(2, 3, 2)) - x = rand(Float32, 3, 4, 2, 3, 4) - y = m(x) - @test y isa Array{Float32, 5} - @test size(y) == (6, 12, 4, 3, 4) + m = Upsample(:trilinear, scale = (2, 3, 2)) + x = rand(Float32, 3, 4, 2, 3, 4) + y = m(x) + @test y isa Array{Float32, 5} + @test size(y) == (6, 12, 4, 3, 4) - m = Upsample(:trilinear, scale=3) - x = rand(Float32, 3, 4, 2, 3, 4) - y = m(x) - @test y isa Array{Float32, 5} - @test size(y) == (9, 12, 6, 3, 4) + m = Upsample(:trilinear, scale = 3) + x = rand(Float32, 3, 4, 2, 3, 4) + y = m(x) + @test y isa Array{Float32, 5} + @test size(y) == (9, 12, 6, 3, 4) - m = Upsample(:trilinear, size=(4, 6, 4)) - x = rand(Float32, 3, 4, 2, 3, 4) - y = m(x) - @test y isa Array{Float32, 5} - @test size(y) == (4, 6, 4, 3, 4) + m = Upsample(:trilinear, size = (4, 6, 4)) + x = rand(Float32, 3, 4, 2, 3, 4) + y = m(x) + @test y isa Array{Float32, 5} + @test size(y) == (4, 6, 4, 3, 4) end @testset "upsample nearest" begin - x = rand(Float32, 3, 2, 3) - m = Upsample(:nearest, scale=(2,)) - y = m(x) - @test y isa Array{Float32, 3} - @test size(y) == (6, 2, 3) + x = rand(Float32, 3, 2, 3) + m = Upsample(:nearest, scale = (2,)) + y = m(x) + @test y isa Array{Float32, 3} + @test size(y) == (6, 2, 3) - x = rand(Float32, 3, 4, 2, 3) - - m = Upsample(:nearest, scale=(2, 3)) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 12, 2, 3) - - m = Upsample(:nearest, scale=(2,)) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 4, 2, 3) + x = rand(Float32, 3, 4, 2, 3) - m = Upsample(:nearest, scale=2) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 8, 2, 3) + m = Upsample(:nearest, scale = (2, 3)) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 12, 2, 3) - m = Upsample(2) - y2 = m(x) - @test y2 ≈ y + m = Upsample(:nearest, scale = (2,)) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 4, 2, 3) - m = Upsample(:nearest, size=(6,8)) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (6, 8, 2, 3) + m = Upsample(:nearest, scale = 2) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 8, 2, 3) + + m = Upsample(2) + y2 = m(x) + @test y2 ≈ y + + m = Upsample(:nearest, size = (6, 8)) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (6, 8, 2, 3) end @testset "PixelShuffle" begin - m = PixelShuffle(2) - x = rand(Float32, 3, 18, 3) - y = m(x) - @test y isa Array{Float32, 3} - @test size(y) == (6, 9, 3) + m = PixelShuffle(2) + x = rand(Float32, 3, 18, 3) + y = m(x) + @test y isa Array{Float32, 3} + @test size(y) == (6, 9, 3) - m = PixelShuffle(3) - x = rand(Float32, 3, 4, 18, 3) - y = m(x) - @test y isa Array{Float32, 4} - @test size(y) == (9, 12, 2, 3) + m = PixelShuffle(3) + x = rand(Float32, 3, 4, 18, 3) + y = m(x) + @test y isa Array{Float32, 4} + @test size(y) == (9, 12, 2, 3) end diff --git a/test/losses.jl b/test/losses.jl index 2ca697a657..288fddd90f 100644 --- a/test/losses.jl +++ b/test/losses.jl @@ -1,74 +1,69 @@ using Test using Flux: onehotbatch, σ -using Flux.Losses: mse, label_smoothing, crossentropy, logitcrossentropy, binarycrossentropy, logitbinarycrossentropy +using Flux.Losses: mse, label_smoothing, crossentropy, logitcrossentropy, + binarycrossentropy, logitbinarycrossentropy using Flux.Losses: xlogx, xlogy # group here all losses, used in tests const ALL_LOSSES = [Flux.Losses.mse, Flux.Losses.mae, Flux.Losses.msle, - Flux.Losses.crossentropy, Flux.Losses.logitcrossentropy, - Flux.Losses.binarycrossentropy, Flux.Losses.logitbinarycrossentropy, - Flux.Losses.kldivergence, - Flux.Losses.huber_loss, - Flux.Losses.tversky_loss, - Flux.Losses.dice_coeff_loss, - Flux.Losses.poisson_loss, - Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, - Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, Flux.Losses.siamese_contrastive_loss] - + Flux.Losses.crossentropy, Flux.Losses.logitcrossentropy, + Flux.Losses.binarycrossentropy, Flux.Losses.logitbinarycrossentropy, + Flux.Losses.kldivergence, + Flux.Losses.huber_loss, + Flux.Losses.tversky_loss, + Flux.Losses.dice_coeff_loss, + Flux.Losses.poisson_loss, + Flux.Losses.hinge_loss, Flux.Losses.squared_hinge_loss, + Flux.Losses.binary_focal_loss, Flux.Losses.focal_loss, + Flux.Losses.siamese_contrastive_loss] @testset "xlogx & xlogy" begin - @test iszero(xlogx(0)) - @test isnan(xlogx(NaN)) - @test xlogx(2) ≈ 2.0 * log(2.0) - @inferred xlogx(2) - @inferred xlogx(0) + @test iszero(xlogx(0)) + @test isnan(xlogx(NaN)) + @test xlogx(2) ≈ 2.0 * log(2.0) + @inferred xlogx(2) + @inferred xlogx(0) - @test iszero(xlogy(0, 1)) - @test isnan(xlogy(NaN, 1)) - @test isnan(xlogy(1, NaN)) - @test isnan(xlogy(NaN, NaN)) - @test xlogy(2, 3) ≈ 2.0 * log(3.0) - @inferred xlogy(2, 3) - @inferred xlogy(0, 1) + @test iszero(xlogy(0, 1)) + @test isnan(xlogy(NaN, 1)) + @test isnan(xlogy(1, NaN)) + @test isnan(xlogy(NaN, NaN)) + @test xlogy(2, 3) ≈ 2.0 * log(3.0) + @inferred xlogy(2, 3) + @inferred xlogy(0, 1) end # First, regression-style y's y = [1, 1, 0, 0] -ŷ = [.9, .1, .1, .9] +ŷ = [0.9, 0.1, 0.1, 0.9] @testset "mse" begin - @test mse(ŷ, y) ≈ (.1^2 + .9^2)/2 + @test mse(ŷ, y) ≈ (0.1^2 + 0.9^2) / 2 - # Test that mse() loss works on complex values: - @test mse(0 + 0im, 1 + 1im) == 2 + # Test that mse() loss works on complex values: + @test mse(0 + 0im, 1 + 1im) == 2 end -@testset "mae" begin - @test Flux.mae(ŷ, y) ≈ 1/2 -end +@testset "mae" begin @test Flux.mae(ŷ, y) ≈ 1 / 2 end -@testset "huber_loss" begin - @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 -end +@testset "huber_loss" begin @test Flux.huber_loss(ŷ, y) ≈ 0.20500000000000002 end -y = [123.0,456.0,789.0] -ŷ = [345.0,332.0,789.0] -@testset "msle" begin - @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 -end +y = [123.0, 456.0, 789.0] +ŷ = [345.0, 332.0, 789.0] +@testset "msle" begin @test Flux.msle(ŷ, y) ≈ 0.38813985859136585 end # Now onehot y's y = onehotbatch([1, 1, 0, 0], 0:1) y_smoothed = label_smoothing(y, 0.1) -ŷ = [.1 .9; .9 .1; .9 .1; .1 .9]' -v = log(.1 / .9) +ŷ = [0.1 0.9; 0.9 0.1; 0.9 0.1; 0.1 0.9]' +v = log(0.1 / 0.9) logŷ = [v 0.0; 0.0 v; 0.0 v; v 0.0]' lossvalue = 1.203972804325936 lossvalue_smoothed = 1.2039728043259348 yl = onehotbatch([1], 0:1) sf = 0.1 -yls = [sf (1-sf)]' # Effective y after label smoothing +yls = [sf (1 - sf)]' # Effective y after label smoothing ylp = [0.9 0.1]' logylp = [0.0 v]' @@ -78,113 +73,118 @@ logylp = [0.0 v]' ya = onehotbatch([1, 1, 1, 0, 0], 0:1) ya_smoothed = label_smoothing(ya, 2sf) y_same = Float32.(ya) -y_sim = y_same .* (1-2*sf) .+ sf +y_sim = y_same .* (1 - 2 * sf) .+ sf y_dis = copy(y_sim) -y_dis[1,:], y_dis[2,:] = y_dis[2,:], y_dis[1,:] +y_dis[1, :], y_dis[2, :] = y_dis[2, :], y_dis[1, :] @testset "crossentropy" begin - @test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9]) - @test crossentropy(ŷ, y) ≈ lossvalue - @test crossentropy(ŷ, y_smoothed) ≈ lossvalue_smoothed - @test crossentropy(ylp, label_smoothing(yl, 2sf)) ≈ -sum(yls.*log.(ylp)) - @test crossentropy(ylp, yl) ≈ -sum(yl.*log.(ylp)) - @test iszero(crossentropy(y_same, ya, ϵ=0)) - @test iszero(crossentropy(ya, ya, ϵ=0)) - @test crossentropy(y_sim, ya) < crossentropy(y_sim, ya_smoothed) - @test crossentropy(y_dis, ya) > crossentropy(y_dis, ya_smoothed) + @test crossentropy([0.1, 0.0, 0.9], [0.1, 0.0, 0.9]) ≈ + crossentropy([0.1, 0.9], [0.1, 0.9]) + @test crossentropy(ŷ, y) ≈ lossvalue + @test crossentropy(ŷ, y_smoothed) ≈ lossvalue_smoothed + @test crossentropy(ylp, label_smoothing(yl, 2sf)) ≈ -sum(yls .* log.(ylp)) + @test crossentropy(ylp, yl) ≈ -sum(yl .* log.(ylp)) + @test iszero(crossentropy(y_same, ya, ϵ = 0)) + @test iszero(crossentropy(ya, ya, ϵ = 0)) + @test crossentropy(y_sim, ya) < crossentropy(y_sim, ya_smoothed) + @test crossentropy(y_dis, ya) > crossentropy(y_dis, ya_smoothed) end @testset "logitcrossentropy" begin - @test logitcrossentropy(logŷ, y) ≈ lossvalue - @test logitcrossentropy(logylp, yl) ≈ -sum(yl.*logsoftmax(logylp)) - @test logitcrossentropy(logylp, label_smoothing(yl, 2sf)) ≈ -sum(yls.*logsoftmax(logylp)) + @test logitcrossentropy(logŷ, y) ≈ lossvalue + @test logitcrossentropy(logylp, yl) ≈ -sum(yl .* logsoftmax(logylp)) + @test logitcrossentropy(logylp, label_smoothing(yl, 2sf)) ≈ + -sum(yls .* logsoftmax(logylp)) end logŷ, y = randn(3), rand(3) -yls = y.*(1-2sf).+sf +yls = y .* (1 - 2sf) .+ sf @testset "binarycrossentropy" begin - @test binarycrossentropy.(σ.(logŷ), label_smoothing(y, 2sf; dims=0); ϵ=0) ≈ -yls.*log.(σ.(logŷ)) - (1 .- yls).*log.(1 .- σ.(logŷ)) - @test binarycrossentropy(σ.(logŷ), y; ϵ=0) ≈ mean(-y.*log.(σ.(logŷ)) - (1 .- y).*log.(1 .- σ.(logŷ))) - @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y.*log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - (1 .- y).*log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ)))) - @test binarycrossentropy([0.1,0.2,0.9], 1) ≈ -mean(log, [0.1,0.2,0.9]) # constant label + @test binarycrossentropy.(σ.(logŷ), label_smoothing(y, 2sf; dims = 0); ϵ = 0) ≈ + -yls .* log.(σ.(logŷ)) - (1 .- yls) .* log.(1 .- σ.(logŷ)) + @test binarycrossentropy(σ.(logŷ), y; ϵ = 0) ≈ + mean(-y .* log.(σ.(logŷ)) - (1 .- y) .* log.(1 .- σ.(logŷ))) + @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y .* log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - + (1 .- y) .* log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ)))) + @test binarycrossentropy([0.1, 0.2, 0.9], 1) ≈ -mean(log, [0.1, 0.2, 0.9]) # constant label end @testset "logitbinarycrossentropy" begin - @test logitbinarycrossentropy.(logŷ, label_smoothing(y, 0.2)) ≈ binarycrossentropy.(σ.(logŷ), label_smoothing(y, 0.2); ϵ=0) - @test logitbinarycrossentropy(logŷ, y) ≈ binarycrossentropy(σ.(logŷ), y; ϵ=0) + @test logitbinarycrossentropy.(logŷ, label_smoothing(y, 0.2)) ≈ + binarycrossentropy.(σ.(logŷ), label_smoothing(y, 0.2); ϵ = 0) + @test logitbinarycrossentropy(logŷ, y) ≈ binarycrossentropy(σ.(logŷ), y; ϵ = 0) end y = onehotbatch([1], 0:1) yls = [0.1 0.9]' @testset "label_smoothing" begin - @test label_smoothing(y, 0.2) == yls - @test label_smoothing(y, 0.2; dims=0) == label_smoothing.(y, 0.2; dims=0) - @test_throws ArgumentError label_smoothing([0., 0., 1., 0.], 1.2) - @test_throws ArgumentError label_smoothing([0., 0., 1., 0.], 0.) + @test label_smoothing(y, 0.2) == yls + @test label_smoothing(y, 0.2; dims = 0) == label_smoothing.(y, 0.2; dims = 0) + @test_throws ArgumentError label_smoothing([0.0, 0.0, 1.0, 0.0], 1.2) + @test_throws ArgumentError label_smoothing([0.0, 0.0, 1.0, 0.0], 0.0) end y = [1 2 3] ŷ = [4.0 5.0 6.0] @testset "kldivergence" begin - @test Flux.kldivergence([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ Flux.kldivergence([0.1,0.9], [0.1,0.9]) - @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457 - @test Flux.kldivergence(y, y) ≈ 0 + @test Flux.kldivergence([0.1, 0.0, 0.9], [0.1, 0.0, 0.9]) ≈ + Flux.kldivergence([0.1, 0.9], [0.1, 0.9]) + @test Flux.kldivergence(ŷ, y) ≈ -1.7661057888493457 + @test Flux.kldivergence(y, y) ≈ 0 end y = [1 2 3 4] ŷ = [5.0 6.0 7.0 8.0] @testset "hinge_loss" begin - @test Flux.hinge_loss(ŷ, y) ≈ 0 - @test Flux.hinge_loss(y, 0.5 .* y) ≈ 0.125 + @test Flux.hinge_loss(ŷ, y) ≈ 0 + @test Flux.hinge_loss(y, 0.5 .* y) ≈ 0.125 end @testset "squared_hinge_loss" begin - @test Flux.squared_hinge_loss(ŷ, y) ≈ 0 - @test Flux.squared_hinge_loss(y, 0.5 .* y) ≈ 0.0625 + @test Flux.squared_hinge_loss(ŷ, y) ≈ 0 + @test Flux.squared_hinge_loss(y, 0.5 .* y) ≈ 0.0625 end y = [0.1 0.2 0.3] ŷ = [0.4 0.5 0.6] @testset "poisson_loss" begin - @test Flux.poisson_loss(ŷ, y) ≈ 0.6278353988097339 - @test Flux.poisson_loss(y, y) ≈ 0.5044459776946685 + @test Flux.poisson_loss(ŷ, y) ≈ 0.6278353988097339 + @test Flux.poisson_loss(y, y) ≈ 0.5044459776946685 end y = [1.0 0.5 0.3 2.4] ŷ = [0 1.4 0.5 1.2] @testset "dice_coeff_loss" begin - @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999 - @test Flux.dice_coeff_loss(y, y) ≈ 0.0 + @test Flux.dice_coeff_loss(ŷ, y) ≈ 0.2799999999999999 + @test Flux.dice_coeff_loss(y, y) ≈ 0.0 end @testset "tversky_loss" begin - @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383 - @test Flux.tversky_loss(ŷ, y, β=0.8) ≈ -0.09490740740740744 - @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075 + @test Flux.tversky_loss(ŷ, y) ≈ -0.06772009029345383 + @test Flux.tversky_loss(ŷ, y, β = 0.8) ≈ -0.09490740740740744 + @test Flux.tversky_loss(y, y) ≈ -0.5576923076923075 end -@testset "no spurious promotions" begin - for T in (Float32, Float64) +@testset "no spurious promotions" begin for T in (Float32, Float64) y = rand(T, 2) ŷ = rand(T, 2) for f in ALL_LOSSES - fwd, back = Flux.pullback(f, ŷ, y) - @test fwd isa T - @test eltype(back(one(T))[1]) == T + fwd, back = Flux.pullback(f, ŷ, y) + @test fwd isa T + @test eltype(back(one(T))[1]) == T end - end -end +end end @testset "binary_focal_loss" begin - y = [0 1 0 - 1 0 1] - ŷ = [0.268941 0.5 0.268941 - 0.731059 0.5 0.731059] + y = [0 1 0 + 1 0 1] + ŷ = [0.268941 0.5 0.268941 + 0.731059 0.5 0.731059] y1 = [1 0 0 1] @@ -192,14 +192,14 @@ end 0.4 0.7] @test Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385 @test Flux.binary_focal_loss(ŷ1, y1) ≈ 0.05691642237852222 - @test Flux.binary_focal_loss(ŷ, y; γ=0.0) ≈ Flux.binarycrossentropy(ŷ, y) + @test Flux.binary_focal_loss(ŷ, y; γ = 0.0) ≈ Flux.binarycrossentropy(ŷ, y) end @testset "focal_loss" begin - y = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] - ŷ = softmax(reshape(-7:7, 3, 5) .* 1f0) + y = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] + ŷ = softmax(reshape(-7:7, 3, 5) .* 1.0f0) y1 = [1 0 0 0 0 1] @@ -208,43 +208,48 @@ end 0.1 0.3] @test Flux.focal_loss(ŷ, y) ≈ 1.1277571935622628 @test Flux.focal_loss(ŷ1, y1) ≈ 0.45990566879720157 - @test Flux.focal_loss(ŷ, y; γ=0.0) ≈ Flux.crossentropy(ŷ, y) + @test Flux.focal_loss(ŷ, y; γ = 0.0) ≈ Flux.crossentropy(ŷ, y) end - + @testset "siamese_contrastive_loss" begin - y = [1 0 - 0 0 - 0 1] - ŷ = [0.4 0.2 - 0.5 0.5 - 0.1 0.3] - y1 = [1 0 0 0 1 - 0 1 0 1 0 - 0 0 1 0 0] - ŷ1 = softmax(reshape(-7:7, 3, 5) .* 1.0f0) - y2 = [1 - 0 - 0 - 1 - 1] - ŷ2 = [0.6 - 0.4 - 0.1 - 0.2 - 0.7] - @test Flux.siamese_contrastive_loss(ŷ, y) ≈ 0.2333333333333333 - @test Flux.siamese_contrastive_loss(ŷ, y, margin = 0.5f0) ≈ 0.10000000000000002 - @test Flux.siamese_contrastive_loss(ŷ, y, margin = 1.5f0) ≈ 0.5333333333333333 - @test Flux.siamese_contrastive_loss(ŷ1, y1) ≈ 0.32554644f0 - @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 0.5f0) ≈ 0.16271012f0 - @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 1.5f0) ≈ 0.6532292f0 - @test Flux.siamese_contrastive_loss(ŷ, y, margin = 1) ≈ Flux.siamese_contrastive_loss(ŷ, y) - @test Flux.siamese_contrastive_loss(y, y) ≈ 0.0 - @test Flux.siamese_contrastive_loss(y1, y1) ≈ 0.0 - @test Flux.siamese_contrastive_loss(ŷ, y, margin = 0) ≈ 0.09166666666666667 - @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 0) ≈ 0.13161165f0 - @test Flux.siamese_contrastive_loss(ŷ2, y2) ≈ 0.21200000000000005 - @test Flux.siamese_contrastive_loss(ŷ2, ŷ2) ≈ 0.18800000000000003 - @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ1, y1, margin = -0.5) - @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ, y, margin = -1) + y = [1 0 + 0 0 + 0 1] + ŷ = [0.4 0.2 + 0.5 0.5 + 0.1 0.3] + y1 = [1 0 0 0 1 + 0 1 0 1 0 + 0 0 1 0 0] + ŷ1 = softmax(reshape(-7:7, 3, 5) .* 1.0f0) + y2 = [1 + 0 + 0 + 1 + 1] + ŷ2 = [0.6 + 0.4 + 0.1 + 0.2 + 0.7] + @test Flux.siamese_contrastive_loss(ŷ, y) ≈ 0.2333333333333333 + @test Flux.siamese_contrastive_loss(ŷ, y, margin = 0.5f0) ≈ 0.10000000000000002 + @test Flux.siamese_contrastive_loss(ŷ, y, margin = 1.5f0) ≈ 0.5333333333333333 + @test Flux.siamese_contrastive_loss(ŷ1, y1) ≈ 0.32554644f0 + @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 0.5f0) ≈ 0.16271012f0 + @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 1.5f0) ≈ 0.6532292f0 + @test Flux.siamese_contrastive_loss(ŷ, y, margin = 1) ≈ + Flux.siamese_contrastive_loss(ŷ, y) + @test Flux.siamese_contrastive_loss(y, y) ≈ 0.0 + @test Flux.siamese_contrastive_loss(y1, y1) ≈ 0.0 + @test Flux.siamese_contrastive_loss(ŷ, y, margin = 0) ≈ 0.09166666666666667 + @test Flux.siamese_contrastive_loss(ŷ1, y1, margin = 0) ≈ 0.13161165f0 + @test Flux.siamese_contrastive_loss(ŷ2, y2) ≈ 0.21200000000000005 + @test Flux.siamese_contrastive_loss(ŷ2, ŷ2) ≈ 0.18800000000000003 + @test_throws DomainError(-0.5, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ1, + y1, + margin = -0.5) + @test_throws DomainError(-1, "Margin must be non-negative") Flux.siamese_contrastive_loss(ŷ, + y, + margin = -1) end diff --git a/test/optimise.jl b/test/optimise.jl index e922d3c0b8..594789f6c2 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -6,199 +6,194 @@ using Test using Random @testset "Optimise" begin - # Ensure rng has different state inside and outside the inner @testset - # so that w and w' are different - Random.seed!(84) - w = randn(10, 10) - @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), - NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), - Nesterov(), RMSProp(), Momentum()] - Random.seed!(42) - w′ = randn(10, 10) - b = false - loss(x) = Flux.Losses.mse(w*x, w′*x .+ b) - for t = 1: 10^5 - θ = params([w′, b]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) + # Ensure rng has different state inside and outside the inner @testset + # so that w and w' are different + Random.seed!(84) + w = randn(10, 10) + @testset for opt in [AdamW(), AdaGrad(0.1), AdaMax(), AdaDelta(0.9), AMSGrad(), + NAdam(), RAdam(), Descent(0.1), Adam(), OAdam(), AdaBelief(), + Nesterov(), RMSProp(), Momentum()] + Random.seed!(42) + w′ = randn(10, 10) + b = false + loss(x) = Flux.Losses.mse(w * x, w′ * x .+ b) + for t in 1:(10^5) + θ = params([w′, b]) + x = rand(10) + θ̄ = gradient(() -> loss(x), θ) + Optimise.update!(opt, θ, θ̄) + end + @test loss(rand(10, 10)) < 0.01 end - @test loss(rand(10, 10)) < 0.01 - end end @testset "Optimiser" begin - Random.seed!(84) - w = randn(10, 10) - @testset for Opt in [InvDecay, WeightDecay, ExpDecay] - Random.seed!(42) - w′ = randn(10, 10) - loss(x) = Flux.Losses.mse(w*x, w′*x) - opt = Optimiser(Opt(), Adam(0.001)) - for t = 1:10^5 - θ = Params([w′]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - Optimise.update!(opt, θ, θ̄) + Random.seed!(84) + w = randn(10, 10) + @testset for Opt in [InvDecay, WeightDecay, ExpDecay] + Random.seed!(42) + w′ = randn(10, 10) + loss(x) = Flux.Losses.mse(w * x, w′ * x) + opt = Optimiser(Opt(), Adam(0.001)) + for t in 1:(10^5) + θ = Params([w′]) + x = rand(10) + θ̄ = gradient(() -> loss(x), θ) + Optimise.update!(opt, θ, θ̄) + end + @test loss(rand(10, 10)) < 0.01 end - @test loss(rand(10, 10)) < 0.01 - end end @testset "Training Loop" begin - i = 0 - l = 1 - Flux.train!( - () -> (sleep(0.1); Flux.skip(); i+=1), - Params([]), - Iterators.repeated((), 10), - Descent() - ) - - @test i==0 #all skipped - - Flux.train!( - () -> (sleep(0.1); i==8 && Flux.skip(); i+=1), - Params([]), - Iterators.repeated((), 10), - Descent() - ) - - @test i==8 #skip after i hit 8 - - i = 0 - Flux.train!(() -> (sleep(0.1); i += 1; l), - Params([]), - Iterators.repeated((), 100), - Descent(), - cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) - - @test 3 < i < 50 - - # Test multiple callbacks - x = 0 - fs = [() -> (), () -> x = 1] - cbs = runall(fs) - cbs() - @test x == 1 - - r = rand(3, 3) - loss(x) = sum(x .* x) - Flux.train!(loss, Flux.params(r), (r,), Descent()) + i = 0 + l = 1 + Flux.train!(() -> (sleep(0.1); Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent()) + + @test i == 0 #all skipped + + Flux.train!(() -> (sleep(0.1); i == 8 && Flux.skip(); i += 1), + Params([]), + Iterators.repeated((), 10), + Descent()) + + @test i == 8 #skip after i hit 8 + + i = 0 + Flux.train!(() -> (sleep(0.1); i += 1; l), + Params([]), + Iterators.repeated((), 100), + Descent(), + cb = Flux.throttle(() -> (i > 3 && Flux.stop()), 1)) + + @test 3 < i < 50 + + # Test multiple callbacks + x = 0 + fs = [() -> (), () -> x = 1] + cbs = runall(fs) + cbs() + @test x == 1 + + r = rand(3, 3) + loss(x) = sum(x .* x) + Flux.train!(loss, Flux.params(r), (r,), Descent()) end @testset "ExpDecay" begin + @testset "Sanity Check" begin + o = ExpDecay(0.2, 0.5, 1, 1e-3) + p = [0.0] + steps = 1:8 + eta_expected = @. max(o.eta * 0.5^steps, o.clip) + eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] + @test eta_actual == eta_expected + end + + @testset "starting step" begin + start = 4 + o = ExpDecay(0.2, 0.5, 1, 1e-3, start) + p = [0.0] + steps = 1:8 + eta_expected = @. max(o.eta * 0.5^max(steps - start, 0), o.clip) + eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] + @test eta_actual == eta_expected + end - @testset "Sanity Check" begin - o = ExpDecay(0.2, 0.5, 1, 1e-3) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ steps, o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - @testset "starting step" begin - start = 4 - o = ExpDecay(0.2, 0.5, 1, 1e-3, start) - p = [0.0] - steps = 1:8 - eta_expected = @. max(o.eta * 0.5 ^ max(steps - start, 0), o.clip) - eta_actual = [Optimise.apply!(o, p, [1.0])[1] for _ in steps] - @test eta_actual == eta_expected - end - - w = randn(10, 10) - o = ExpDecay(0.1, 0.1, 1000, 1e-4) - w1 = randn(10,10) - loss(x) = Flux.Losses.mse(w*x, w1*x) - flag = 1 - decay_steps = [] - for t = 1:10^5 - prev_eta = o.eta - θ = Params([w1]) - x = rand(10) - θ̄ = gradient(() -> loss(x), θ) - prev_grad = collect(θ̄[w1]) - delta = Optimise.apply!(o, w1, θ̄[w1]) - w1 .-= delta - new_eta = o.eta - if new_eta != prev_eta - push!(decay_steps, t) + w = randn(10, 10) + o = ExpDecay(0.1, 0.1, 1000, 1e-4) + w1 = randn(10, 10) + loss(x) = Flux.Losses.mse(w * x, w1 * x) + flag = 1 + decay_steps = [] + for t in 1:(10^5) + prev_eta = o.eta + θ = Params([w1]) + x = rand(10) + θ̄ = gradient(() -> loss(x), θ) + prev_grad = collect(θ̄[w1]) + delta = Optimise.apply!(o, w1, θ̄[w1]) + w1 .-= delta + new_eta = o.eta + if new_eta != prev_eta + push!(decay_steps, t) + end + array = fill(o.eta, size(prev_grad)) + if array .* prev_grad != delta + flag = 0 + end end - array = fill(o.eta, size(prev_grad)) - if array .* prev_grad != delta - flag = 0 + @test flag == 1 + # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). + ground_truth = [] + for i in 1:4 + push!(ground_truth, 1000 * i) # Expected decay steps for this example. end - end - @test flag == 1 - # Test to check if decay happens at decay steps. Eta reaches clip value (1e-4) after 4000 steps (decay by 0.1 every 1000 steps starting at 0.1). - ground_truth = [] - for i in 1:4 - push!(ground_truth, 1000*i) # Expected decay steps for this example. - end - @test decay_steps == ground_truth - @test o.eta == o.clip + @test decay_steps == ground_truth + @test o.eta == o.clip end @testset "Clipping" begin - w = randn(10, 10) - loss(x) = sum(w * x) - θ = Params([w]) - x = 1000 * randn(10) - w̄ = gradient(() -> loss(x), θ)[w] - w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) - @test all(w̄_value .<= 1) - w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) - @test norm(w̄_norm) <= 1 + w = randn(10, 10) + loss(x) = sum(w * x) + θ = Params([w]) + x = 1000 * randn(10) + w̄ = gradient(() -> loss(x), θ)[w] + w̄_value = Optimise.apply!(ClipValue(1.0), w, copy(w̄)) + @test all(w̄_value .<= 1) + w̄_norm = Optimise.apply!(ClipNorm(1.0), w, copy(w̄)) + @test norm(w̄_norm) <= 1 end @testset "update!: handle Fills from Zygote" begin - w = randn(10,10) - wold = copy(w) - g = FillArrays.Ones(size(w)) - opt = Descent(0.1) - Flux.update!(opt, w, g) - @test w ≈ wold .- 0.1 - - w = randn(3) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> w[1], θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w[1] ≈ wold[1] .- 0.1 - @test w[2:3] ≈ wold[2:3] - - ## Issue #1510 - w = randn(10,10) - wold = copy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 + w = randn(10, 10) + wold = copy(w) + g = FillArrays.Ones(size(w)) + opt = Descent(0.1) + Flux.update!(opt, w, g) + @test w ≈ wold .- 0.1 + + w = randn(3) + wold = copy(w) + θ = Flux.params([w]) + gs = gradient(() -> w[1], θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w[1] ≈ wold[1] .- 0.1 + @test w[2:3] ≈ wold[2:3] + + ## Issue #1510 + w = randn(10, 10) + wold = copy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w ≈ wold .- 0.1 end @testset "update!: handle ComponentArrays" begin - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w.a) + sum(w.c.b), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w.a ≈ wold.a .- 0.1 - @test w.b ≈ wold.b - @test w.c.b ≈ wold.c.b .- 0.1 - @test w.c.a ≈ wold.c.a - - w = ComponentArrays.ComponentArray(a=1.0, b=[2, 1, 4], c=(a=2, b=[1, 2])) - wold = deepcopy(w) - θ = Flux.params([w]) - gs = gradient(() -> sum(w), θ) - opt = Descent(0.1) - Flux.update!(opt, θ, gs) - @test w ≈ wold .- 0.1 + w = ComponentArrays.ComponentArray(a = 1.0, b = [2, 1, 4], c = (a = 2, b = [1, 2])) + wold = deepcopy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w.a) + sum(w.c.b), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w.a ≈ wold.a .- 0.1 + @test w.b ≈ wold.b + @test w.c.b ≈ wold.c.b .- 0.1 + @test w.c.a ≈ wold.c.a + + w = ComponentArrays.ComponentArray(a = 1.0, b = [2, 1, 4], c = (a = 2, b = [1, 2])) + wold = deepcopy(w) + θ = Flux.params([w]) + gs = gradient(() -> sum(w), θ) + opt = Descent(0.1) + Flux.update!(opt, θ, gs) + @test w ≈ wold .- 0.1 end # Flux PR #1776 @@ -209,15 +204,15 @@ end # wreaks all sorts of havoc on our training loops. This test ensures that # a simple optimization is montonically decreasing (up to learning step effects) @testset "Momentum Optimisers and complex values" begin - # Test every optimizer that has momentum internally - for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] +# Test every optimizer that has momentum internally +for opt_ctor in [Adam, RMSProp, RAdam, OAdam, AdaGrad, AdaDelta, NAdam, AdaBelief] # Our "model" is just a complex number w = zeros(ComplexF32, 1) # Our model attempts to learn `f(x) = conj(x)` where `f(x) = w*x` function loss() # Deterministic training data is the best training data - x = ones(1, 1) + 1im*ones(1, 1) + x = ones(1, 1) + 1im * ones(1, 1) # Manually implement `mse()` to allow demonstration of brokenness # on older Flux builds that don't have a fixed `mse()` @@ -235,5 +230,4 @@ end last_loss = loss() Flux.update!(opt, params, grads) end - end -end +end end diff --git a/test/outputsize.jl b/test/outputsize.jl index eec6880dc2..d3d8de497d 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -1,247 +1,248 @@ @testset "basic" begin - m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) - @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1) + m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) + @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1) - m = Dense(10, 5) - @test_throws DimensionMismatch outputsize(m, (5, 2)) == (5, 1) - @test outputsize(m, (10,); padbatch=true) == (5, 1) + m = Dense(10, 5) + @test_throws DimensionMismatch outputsize(m, (5, 2))==(5, 1) + @test outputsize(m, (10,); padbatch = true) == (5, 1) - m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2)) - @test outputsize(m, (10,); padbatch=true) == (2, 1) - @test outputsize(m, (10, 30)) == (2, 30) + m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2)) + @test outputsize(m, (10,); padbatch = true) == (2, 1) + @test outputsize(m, (10, 30)) == (2, 30) - @info "Don't mind the following error, it's for testing purpose." - m = Chain(Dense(10, 8, σ), Dense(8, 4), Dense(5, 2)) - @test_throws DimensionMismatch outputsize(m, (10,)) + @info "Don't mind the following error, it's for testing purpose." + m = Chain(Dense(10, 8, σ), Dense(8, 4), Dense(5, 2)) + @test_throws DimensionMismatch outputsize(m, (10,)) - m = Flux.Scale(10) - @test outputsize(m, (10, 1)) == (10, 1) + m = Flux.Scale(10) + @test outputsize(m, (10, 1)) == (10, 1) - m = Maxout(() -> Conv((3, 3), 3 => 16), 2) - @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) + m = Maxout(() -> Conv((3, 3), 3 => 16), 2) + @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) - m = flatten - @test outputsize(m, (5, 5, 3, 10)) == (75, 10) + m = flatten + @test outputsize(m, (5, 5, 3, 10)) == (75, 10) - m = Flux.unsqueeze(dims=3) - @test outputsize(m, (5, 7, 13)) == (5, 7, 1, 13) + m = Flux.unsqueeze(dims = 3) + @test outputsize(m, (5, 7, 13)) == (5, 7, 1, 13) - m = Flux.Bilinear(10, 10, 7) - @test outputsize(m, (10,)) == (7,) - @test outputsize(m, (10, 32)) == (7, 32) + m = Flux.Bilinear(10, 10, 7) + @test outputsize(m, (10,)) == (7,) + @test outputsize(m, (10, 32)) == (7, 32) - m = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), flatten, Dense(1024, 10)) - @test outputsize(m, (10, 10, 3, 50)) == (10, 50) - @test outputsize(m, (10, 10, 3, 2)) == (10, 2) + m = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), flatten, Dense(1024, 10)) + @test outputsize(m, (10, 10, 3, 50)) == (10, 50) + @test outputsize(m, (10, 10, 3, 2)) == (10, 2) - m = SkipConnection(Conv((3, 3), 3 => 16; pad = 1), (mx, x) -> cat(mx, x; dims = 3)) - @test outputsize(m, (10, 10, 3, 1)) == (10, 10, 19, 1) + m = SkipConnection(Conv((3, 3), 3 => 16; pad = 1), (mx, x) -> cat(mx, x; dims = 3)) + @test outputsize(m, (10, 10, 3, 1)) == (10, 10, 19, 1) - m = Parallel((mx, x) -> cat(mx, x; dims = 3), Conv((3, 3), 3 => 16; pad = 1), identity) - @test outputsize(m, (10, 10, 3, 1)) == (10, 10, 19, 1) + m = Parallel((mx, x) -> cat(mx, x; dims = 3), Conv((3, 3), 3 => 16; pad = 1), identity) + @test outputsize(m, (10, 10, 3, 1)) == (10, 10, 19, 1) end @testset "multiple inputs" begin - m = Parallel(vcat, Dense(2, 4, relu), Dense(3, 6, relu)) - @test outputsize(m, (2,), (3,)) == (10,) - @test outputsize(m, ((2,), (3,))) == (10,) - @test outputsize(m, (2,), (3,); padbatch=true) == (10, 1) - @test outputsize(m, (2,7), (3,7)) == (10, 7) - - m = Chain(m, Dense(10, 13, tanh), softmax) - @test outputsize(m, (2,), (3,)) == (13,) - @test outputsize(m, ((2,), (3,))) == (13,) - @test outputsize(m, (2,), (3,); padbatch=true) == (13, 1) - @test outputsize(m, (2,7), (3,7)) == (13, 7) + m = Parallel(vcat, Dense(2, 4, relu), Dense(3, 6, relu)) + @test outputsize(m, (2,), (3,)) == (10,) + @test outputsize(m, ((2,), (3,))) == (10,) + @test outputsize(m, (2,), (3,); padbatch = true) == (10, 1) + @test outputsize(m, (2, 7), (3, 7)) == (10, 7) + + m = Chain(m, Dense(10, 13, tanh), softmax) + @test outputsize(m, (2,), (3,)) == (13,) + @test outputsize(m, ((2,), (3,))) == (13,) + @test outputsize(m, (2,), (3,); padbatch = true) == (13, 1) + @test outputsize(m, (2, 7), (3, 7)) == (13, 7) end -@testset "activations" begin - @testset for f in [celu, elu, gelu, hardsigmoid, hardtanh, - leakyrelu, lisht, logcosh, logσ, mish, - relu, relu6, rrelu, selu, σ, softplus, - softshrink, softsign, swish, tanhshrink, trelu] +@testset "activations" begin @testset for f in [celu, elu, gelu, hardsigmoid, hardtanh, + leakyrelu, lisht, logcosh, logσ, mish, + relu, relu6, rrelu, selu, σ, softplus, + softshrink, softsign, swish, tanhshrink, trelu] @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) - end -end +end end @testset "conv" begin - m = Conv((3, 3), 3 => 16) - @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) - m = Conv((3, 3), 3 => 16; stride = 2) - @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 16, 1) - m = Conv((3, 3), 3 => 16; stride = 2, pad = 3) - @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) - m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) - @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 16, 1) - @test_throws DimensionMismatch outputsize(m, (5, 5, 2)) - @test outputsize(m, (5, 5, 3, 100)) == (4, 4, 16, 100) - - m = ConvTranspose((3, 3), 3 => 16) - @test outputsize(m, (8, 8, 3, 1)) == (10, 10, 16, 1) - m = ConvTranspose((3, 3), 3 => 16; stride = 2) - @test outputsize(m, (2, 2, 3, 1)) == (5, 5, 16, 1) - m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3) - @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) - m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) - @test outputsize(m, (4, 4, 3, 1)) == (5, 5, 16, 1) - - m = DepthwiseConv((3, 3), 3 => 6) - @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 6, 1) - m = DepthwiseConv((3, 3), 3 => 6; stride = 2) - @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 6, 1) - m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3) - @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 6, 1) - m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2) - @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 6, 1) - - m = CrossCor((3, 3), 3 => 16) - @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) - m = CrossCor((3, 3), 3 => 16; stride = 2) - @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 16, 1) - m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3) - @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) - m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) - @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 16, 1) - - m = AdaptiveMaxPool((2, 2)) - @test outputsize(m, (10, 10, 3, 1)) == (2, 2, 3, 1) - - m = AdaptiveMeanPool((2, 2)) - @test outputsize(m, (10, 10, 3, 1)) == (2, 2, 3, 1) - - m = GlobalMaxPool() - @test outputsize(m, (10, 10, 3, 1)) == (1, 1, 3, 1) - - m = GlobalMeanPool() - @test outputsize(m, (10, 10, 3, 1)) == (1, 1, 3, 1) - - m = MaxPool((2, 2)) - @test outputsize(m, (10, 10, 3, 1)) == (5, 5, 3, 1) - m = MaxPool((2, 2); stride = 1) - @test outputsize(m, (5, 5, 4, 1)) == (4, 4, 4, 1) - m = MaxPool((2, 2); stride = 2, pad = 3) - @test outputsize(m, (5, 5, 2, 1)) == (5, 5, 2, 1) - - m = MeanPool((2, 2)) - @test outputsize(m, (10, 10, 3, 1)) == (5, 5, 3, 1) - m = MeanPool((2, 2); stride = 1) - @test outputsize(m, (5, 5, 4, 1)) == (4, 4, 4, 1) - m = MeanPool((2, 2); stride = 2, pad = 3) - @test outputsize(m, (5, 5, 2, 1)) == (5, 5, 2, 1) + m = Conv((3, 3), 3 => 16) + @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) + m = Conv((3, 3), 3 => 16; stride = 2) + @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 16, 1) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3) + @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) + m = Conv((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 16, 1) + @test_throws DimensionMismatch outputsize(m, (5, 5, 2)) + @test outputsize(m, (5, 5, 3, 100)) == (4, 4, 16, 100) + + m = ConvTranspose((3, 3), 3 => 16) + @test outputsize(m, (8, 8, 3, 1)) == (10, 10, 16, 1) + m = ConvTranspose((3, 3), 3 => 16; stride = 2) + @test outputsize(m, (2, 2, 3, 1)) == (5, 5, 16, 1) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3) + @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) + m = ConvTranspose((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test outputsize(m, (4, 4, 3, 1)) == (5, 5, 16, 1) + + m = DepthwiseConv((3, 3), 3 => 6) + @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 6, 1) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2) + @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 6, 1) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3) + @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 6, 1) + m = DepthwiseConv((3, 3), 3 => 6; stride = 2, pad = 3, dilation = 2) + @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 6, 1) + + m = CrossCor((3, 3), 3 => 16) + @test outputsize(m, (10, 10, 3, 1)) == (8, 8, 16, 1) + m = CrossCor((3, 3), 3 => 16; stride = 2) + @test outputsize(m, (5, 5, 3, 1)) == (2, 2, 16, 1) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3) + @test outputsize(m, (5, 5, 3, 1)) == (5, 5, 16, 1) + m = CrossCor((3, 3), 3 => 16; stride = 2, pad = 3, dilation = 2) + @test outputsize(m, (5, 5, 3, 1)) == (4, 4, 16, 1) + + m = AdaptiveMaxPool((2, 2)) + @test outputsize(m, (10, 10, 3, 1)) == (2, 2, 3, 1) + + m = AdaptiveMeanPool((2, 2)) + @test outputsize(m, (10, 10, 3, 1)) == (2, 2, 3, 1) + + m = GlobalMaxPool() + @test outputsize(m, (10, 10, 3, 1)) == (1, 1, 3, 1) + + m = GlobalMeanPool() + @test outputsize(m, (10, 10, 3, 1)) == (1, 1, 3, 1) + + m = MaxPool((2, 2)) + @test outputsize(m, (10, 10, 3, 1)) == (5, 5, 3, 1) + m = MaxPool((2, 2); stride = 1) + @test outputsize(m, (5, 5, 4, 1)) == (4, 4, 4, 1) + m = MaxPool((2, 2); stride = 2, pad = 3) + @test outputsize(m, (5, 5, 2, 1)) == (5, 5, 2, 1) + + m = MeanPool((2, 2)) + @test outputsize(m, (10, 10, 3, 1)) == (5, 5, 3, 1) + m = MeanPool((2, 2); stride = 1) + @test outputsize(m, (5, 5, 4, 1)) == (4, 4, 4, 1) + m = MeanPool((2, 2); stride = 2, pad = 3) + @test outputsize(m, (5, 5, 2, 1)) == (5, 5, 2, 1) end @testset "normalisation" begin - m = Dropout(0.1) - @test outputsize(m, (10, 10)) == (10, 10) - @test outputsize(m, (10,); padbatch=true) == (10, 1) - - m = AlphaDropout(0.1) - @test outputsize(m, (10, 10)) == (10, 10) - @test outputsize(m, (10,); padbatch=true) == (10, 1) - - m = LayerNorm(32) - @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) - @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) - m2 = LayerNorm(3, 2) - @test outputsize(m2, (3, 2)) == (3, 2) == size(m2(randn(3, 2))) - @test outputsize(m2, (3,)) == (3, 2) == size(m2(randn(3, 2))) - - m = BatchNorm(3) - @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) - @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) - @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) - @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) - - m = InstanceNorm(3) - @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) - @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1) - @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) - @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) - - m = GroupNorm(16, 4) - @test outputsize(m, (32, 32, 16, 16)) == (32, 32, 16, 16) - @test outputsize(m, (32, 32, 16); padbatch=true) == (32, 32, 16, 1) - @test_throws Exception m(randn(Float32, 32, 32, 15, 4)) - @test_throws DimensionMismatch outputsize(m, (32, 32, 15, 4)) + m = Dropout(0.1) + @test outputsize(m, (10, 10)) == (10, 10) + @test outputsize(m, (10,); padbatch = true) == (10, 1) + + m = AlphaDropout(0.1) + @test outputsize(m, (10, 10)) == (10, 10) + @test outputsize(m, (10,); padbatch = true) == (10, 1) + + m = LayerNorm(32) + @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) + @test outputsize(m, (32, 32, 3); padbatch = true) == (32, 32, 3, 1) + m2 = LayerNorm(3, 2) + @test outputsize(m2, (3, 2)) == (3, 2) == size(m2(randn(3, 2))) + @test outputsize(m2, (3,)) == (3, 2) == size(m2(randn(3, 2))) + + m = BatchNorm(3) + @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) + @test outputsize(m, (32, 32, 3); padbatch = true) == (32, 32, 3, 1) + @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) + + m = InstanceNorm(3) + @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16) + @test outputsize(m, (32, 32, 3); padbatch = true) == (32, 32, 3, 1) + @test_throws Exception m(randn(Float32, 32, 32, 5, 1)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 5, 1)) + + m = GroupNorm(16, 4) + @test outputsize(m, (32, 32, 16, 16)) == (32, 32, 16, 16) + @test outputsize(m, (32, 32, 16); padbatch = true) == (32, 32, 16, 1) + @test_throws Exception m(randn(Float32, 32, 32, 15, 4)) + @test_throws DimensionMismatch outputsize(m, (32, 32, 15, 4)) end @testset "autosize macro" begin - m = @autosize (3,) Dense(_ => 4) - @test randn(3) |> m |> size == (4,) - - m = @autosize (3, 1) Chain(Dense(_, 4), Dense(4 => 10), softmax) - @test randn(3, 5) |> m |> size == (10, 5) - - m = @autosize (2, 3, 4, 5) Dense(_ => 10) # goes by first dim, not 2nd-last - @test randn(2, 3, 4, 5) |> m |> size == (10, 3, 4, 5) - - m = @autosize (9,) Dense(_ => div(_,2)) - @test randn(9) |> m |> size == (4,) - - m = @autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw - @test randn(3) |> m |> size == (4,) - - m = @autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block - @test randn(3, 45) |> m |> size == (6, 45) - - # here Parallel gets two inputs, no problem: - m = @autosize (3,) Chain(SkipConnection(Dense(_ => 4), Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), Flux.Scale(_)) - @test randn(3) |> m |> size == (11,) - - # like Dense, LayerNorm goes by the first dimension: - m = @autosize (3, 4, 5) LayerNorm(_) - @test rand(3, 6, 7) |> m |> size == (3, 6, 7) - - m = @autosize (3, 3, 10) LayerNorm(_, _) # does not check that sizes match - @test rand(3, 3, 10) |> m |> size == (3, 3, 10) - - m = @autosize (3,) Flux.Bilinear(_ => 10) - @test randn(3) |> m |> size == (10,) - - m = @autosize (3, 1) Flux.Bilinear(_ => 10) - @test randn(3, 4) |> m |> size == (10, 4) - - m = @autosize (3,) SkipConnection(Dense(_ => _), Flux.Bilinear(_ => 10)) # Bilinear gets two inputs - @test randn(3, 4) |> m |> size == (10, 4) - - @test_throws Exception @eval @autosize (3,) Flux.Bilinear((_,3) => 10) - - # first docstring example - m = @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine=false)) - @test randn(3, 4) |> m |> size == (2, 4) - - # evil docstring example - img = [28, 28]; - m = @autosize (img..., 1, 32) Chain( # size is only needed at runtime - Chain(c = Conv((3,3), _ => 5; stride=2, pad=SamePad()), - p = MeanPool((3,3)), - b = BatchNorm(_), - f = Flux.flatten), - Dense(_ => _÷4, relu, init=Flux.rand32), # can calculate output size _÷4 - SkipConnection(Dense(_ => _, relu), +), - Dense(_ => 10), - ) - @test randn(Float32, img..., 1, 32) |> m |> size == (10, 32) + m = @autosize (3,) Dense(_ => 4) + @test randn(3) |> m |> size == (4,) + + m = @autosize (3, 1) Chain(Dense(_, 4), Dense(4 => 10), softmax) + @test randn(3, 5) |> m |> size == (10, 5) + + m = @autosize (2, 3, 4, 5) Dense(_ => 10) # goes by first dim, not 2nd-last + @test randn(2, 3, 4, 5) |> m |> size == (10, 3, 4, 5) + + m = @autosize (9,) Dense(_ => div(_, 2)) + @test randn(9) |> m |> size == (4,) + + m = @autosize (3,) Chain(one = Dense(_ => 4), two = softmax) # needs kw + @test randn(3) |> m |> size == (4,) + + m = @autosize (3, 45) Maxout(() -> Dense(_ => 6, tanh), 2) # needs ->, block + @test randn(3, 45) |> m |> size == (6, 45) + + # here Parallel gets two inputs, no problem: + m = @autosize (3,) Chain(SkipConnection(Dense(_ => 4), + Parallel(vcat, Dense(_ => 5), Dense(_ => 6))), + Flux.Scale(_)) + @test randn(3) |> m |> size == (11,) + + # like Dense, LayerNorm goes by the first dimension: + m = @autosize (3, 4, 5) LayerNorm(_) + @test rand(3, 6, 7) |> m |> size == (3, 6, 7) + + m = @autosize (3, 3, 10) LayerNorm(_, _) # does not check that sizes match + @test rand(3, 3, 10) |> m |> size == (3, 3, 10) + + m = @autosize (3,) Flux.Bilinear(_ => 10) + @test randn(3) |> m |> size == (10,) + + m = @autosize (3, 1) Flux.Bilinear(_ => 10) + @test randn(3, 4) |> m |> size == (10, 4) + + m = @autosize (3,) SkipConnection(Dense(_ => _), Flux.Bilinear(_ => 10)) # Bilinear gets two inputs + @test randn(3, 4) |> m |> size == (10, 4) + + @test_throws Exception @eval @autosize (3,) Flux.Bilinear((_, 3) => 10) + + # first docstring example + m = @autosize (3, 1) Chain(Dense(_ => 2, sigmoid), BatchNorm(_, affine = false)) + @test randn(3, 4) |> m |> size == (2, 4) + + # evil docstring example + img = [28, 28] + m = @autosize (img..., 1, 32) Chain(Chain(c = Conv((3, 3), _ => 5; stride = 2, + pad = SamePad()), + p = MeanPool((3, 3)), + b = BatchNorm(_), + f = Flux.flatten), + Dense(_ => _ ÷ 4, relu, init = Flux.rand32), # can calculate output size _÷4 + SkipConnection(Dense(_ => _, relu), +), + Dense(_ => 10)) + @test randn(Float32, img..., 1, 32) |> m |> size == (10, 32) end @testset "LazyLayer" begin - # This is what `@autosize` uses, ideally nobody should make these by hand! - # Implicitly testeed by the macro, explicitly here too: - ld = Flux.LazyLayer("Dense(_ => 3, relu; init=??)", x -> Dense(Flux.autosizefor(Dense, x) => 3, relu, init=ones), nothing) + # This is what `@autosize` uses, ideally nobody should make these by hand! + # Implicitly testeed by the macro, explicitly here too: + ld = Flux.LazyLayer("Dense(_ => 3, relu; init=??)", + x -> Dense(Flux.autosizefor(Dense, x) => 3, relu, init = ones), + nothing) + + lm = Chain(ld, Flux.Scale(3)) + @test string(ld) == "LazyLayer(Dense(_ => 3, relu; init=??))" + @test_throws Exception Flux.striplazy(lm) - lm = Chain(ld, Flux.Scale(3)) - @test string(ld) == "LazyLayer(Dense(_ => 3, relu; init=??))" - @test_throws Exception Flux.striplazy(lm) + @test lm([1, 2]) == [3, 3, 3] - @test lm([1,2]) == [3,3,3] + @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" + @test Flux.striplazy(ld) isa Dense - @test string(ld) == "LazyLayer(Dense(2 => 3, relu))" - @test Flux.striplazy(ld) isa Dense + @test_throws Exception Flux.params(lm) + @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1, 2]) + @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1, 2])), ld) - @test_throws Exception Flux.params(lm) - @test_throws Exception gradient(x -> sum(abs2, lm(x)), [1,2]) - @test_throws Exception gradient(m -> sum(abs2, Flux.striplazy(m)([1,2])), ld) - - # Can't let |> gpu act before the arrays are materialized... so it's an error: - @test_throws ErrorException @eval @autosize (1,2,3) Dense(_=>2) |> f64 + # Can't let |> gpu act before the arrays are materialized... so it's an error: + @test_throws ErrorException @eval @autosize (1, 2, 3) Dense(_ => 2)|>f64 end diff --git a/test/runtests.jl b/test/runtests.jl index 9027b114fc..4189ea0dd5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,53 +11,44 @@ using CUDA Random.seed!(0) @testset verbose=true "Flux.jl" begin + @testset "Utils" begin include("utils.jl") end - @testset "Utils" begin - include("utils.jl") - end - - @testset "Optimise" begin - include("optimise.jl") - end - - @testset "Data" begin - include("data.jl") - end - - @testset "Losses" begin - include("losses.jl") - include("ctc.jl") - CUDA.functional() && include("ctc-gpu.jl") - end - - @testset "Layers" begin - include("layers/basic.jl") - include("layers/normalisation.jl") - include("layers/stateless.jl") - include("layers/recurrent.jl") - include("layers/conv.jl") - include("layers/upsample.jl") - include("layers/show.jl") - end - - @testset "outputsize" begin - using Flux: outputsize - include("outputsize.jl") - end - - @testset "CUDA" begin - if CUDA.functional() - include("cuda/runtests.jl") - else - @warn "CUDA unavailable, not testing GPU support" + @testset "Optimise" begin include("optimise.jl") end + + @testset "Data" begin include("data.jl") end + + @testset "Losses" begin + include("losses.jl") + include("ctc.jl") + CUDA.functional() && include("ctc-gpu.jl") + end + + @testset "Layers" begin + include("layers/basic.jl") + include("layers/normalisation.jl") + include("layers/stateless.jl") + include("layers/recurrent.jl") + include("layers/conv.jl") + include("layers/upsample.jl") + include("layers/show.jl") + end + + @testset "outputsize" begin + using Flux: outputsize + include("outputsize.jl") end - end - @static if VERSION == v"1.6" - using Documenter - @testset "Docs" begin - DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive=true) - doctest(Flux) + @testset "CUDA" begin if CUDA.functional() + include("cuda/runtests.jl") + else + @warn "CUDA unavailable, not testing GPU support" + end end + + @static if VERSION == v"1.6" + using Documenter + @testset "Docs" begin + DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true) + doctest(Flux) + end end - end end diff --git a/test/utils.jl b/test/utils.jl index 20359daf25..3607250883 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,8 +1,8 @@ using Flux using Flux: throttle, nfan, glorot_uniform, glorot_normal, - kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, - sparse_init, identity_init, unstack, batch, unbatch, - unsqueeze, params, loadparams!, loadmodel! + kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, + sparse_init, identity_init, unstack, batch, unbatch, + unsqueeze, params, loadparams!, loadmodel! using MLUtils using StatsBase: var, std using Statistics, LinearAlgebra @@ -10,798 +10,798 @@ using Random using Test @testset "Throttle" begin - @testset "default behaviour" begin - a = [] - f = throttle(()->push!(a, time()), 1, leading=true, trailing=false) - f() - f() - f() - sleep(1.01) - @test length(a) == 1 - end - - @testset "leading behaviour" begin - a = [] - f = throttle(()->push!(a, time()), 1, leading=true, trailing=false) - f() - @test length(a) == 1 - f() - @test length(a) == 1 - sleep(1.01) - f() - @test length(a) == 2 - end - - @testset "trailing behaviour" begin - a = [] - f = throttle(()->push!(a, time()), 1, leading=false, trailing=true) - f() - @test length(a) == 0 - f() - @test length(a) == 0 - sleep(1.01) - @test length(a) == 1 - end - - @testset "arguments" begin - a = [] - f = throttle((x)->push!(a, x), 1, leading=true, trailing=true) - f(1) - @test a == [1] - f(2) - @test a == [1] - f(3) - @test a == [1] - sleep(1.01) - @test a == [1, 3] - end + @testset "default behaviour" begin + a = [] + f = throttle(() -> push!(a, time()), 1, leading = true, trailing = false) + f() + f() + f() + sleep(1.01) + @test length(a) == 1 + end + + @testset "leading behaviour" begin + a = [] + f = throttle(() -> push!(a, time()), 1, leading = true, trailing = false) + f() + @test length(a) == 1 + f() + @test length(a) == 1 + sleep(1.01) + f() + @test length(a) == 2 + end + + @testset "trailing behaviour" begin + a = [] + f = throttle(() -> push!(a, time()), 1, leading = false, trailing = true) + f() + @test length(a) == 0 + f() + @test length(a) == 0 + sleep(1.01) + @test length(a) == 1 + end + + @testset "arguments" begin + a = [] + f = throttle((x) -> push!(a, x), 1, leading = true, trailing = true) + f(1) + @test a == [1] + f(2) + @test a == [1] + f(3) + @test a == [1] + sleep(1.01) + @test a == [1, 3] + end end @testset "Initialization" begin - # Set random seed so that these tests don't fail randomly - Random.seed!(0) - - @testset "Fan in/out" begin - @test nfan() == (1, 1) #For a constant - @test nfan(100) == (1, 100) #For vector - @test nfan(100, 200) == (200, 100) == nfan((100, 200)) #For Dense layer - @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer - @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer - @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer - end - - @testset "Basics: $init" for init in [ - glorot_uniform, glorot_normal, - kaiming_uniform, kaiming_normal, - orthogonal, - sparse_init, - truncated_normal, - identity_init, - Flux.rand32, - Flux.randn32, + # Set random seed so that these tests don't fail randomly + Random.seed!(0) + + @testset "Fan in/out" begin + @test nfan() == (1, 1) #For a constant + @test nfan(100) == (1, 100) #For vector + @test nfan(100, 200) == (200, 100) == nfan((100, 200)) #For Dense layer + @test nfan(2, 30, 40) == (2 * 30, 2 * 40) #For 1D Conv layer + @test nfan(2, 3, 40, 50) == (2 * 3 * 40, 2 * 3 * 50) #For 2D Conv layer + @test nfan(2, 3, 4, 50, 60) == (2 * 3 * 4 * 50, 2 * 3 * 4 * 60) #For 3D Conv layer + end + + @testset "Basics: $init" for init in [ + glorot_uniform, glorot_normal, + kaiming_uniform, kaiming_normal, + orthogonal, + sparse_init, + truncated_normal, + identity_init, + Flux.rand32, + Flux.randn32, ] - if init == sparse_init - init = (args...) -> sparse_init(args...; sparsity=0.5) - else - # sparse_init is the only one which accepts only matrices: - @test size(init(3)) == (3,) - @test size(init(3, 4, 5)) == (3, 4, 5) - end - @test size(init(3, 4)) == (3, 4) - # only init(size...) is accepted: - @test_throws MethodError size(init((3, 4, 5))) == (3, 4, 5) - - # rng, and currying: - @test size(init(MersenneTwister(1), 3, 4)) == (3, 4) - closure = init(MersenneTwister(1)) - @test size(closure(3, 4)) == (3, 4) - - # eltype, default Float32 - @test eltype(init(3, 4)) == Float32 - - # @non_differentiable - @test gradient(x -> sum(x .* init(3, 4)), 5.0)[1] isa Number - end - - @testset "glorot: $init" for init ∈ [glorot_uniform, glorot_normal] - # glorot_uniform and glorot_normal should both yield a kernel with - # variance ≈ 2/(fan_in + fan_out) - for dims ∈ [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] - v = init(dims...) - fan_in, fan_out = nfan(dims...) - σ2 = 2 / (fan_in + fan_out) - @test 0.9σ2 < var(v) < 1.1σ2 - end - @test eltype(init(3, 4; gain=1.5)) == Float32 - end - - @testset "kaiming" begin - # kaiming_uniform should yield a kernel in range [-sqrt(6/n_out), sqrt(6/n_out)] - # and kaiming_normal should yield a kernel with stddev ~= sqrt(2/n_out) - for (n_in, n_out) in [(100, 100), (100, 400)] - v = kaiming_uniform(n_in, n_out) - σ2 = sqrt(6/n_out) - @test -1σ2 < minimum(v) < -0.9σ2 - @test 0.9σ2 < maximum(v) < 1σ2 - - v = kaiming_normal(n_in, n_out) - σ2 = sqrt(2/n_out) - @test 0.9σ2 < std(v) < 1.1σ2 - end - @test eltype(kaiming_uniform(3, 4; gain=1.5)) == Float32 - @test eltype(kaiming_normal(3, 4; gain=1.5)) == Float32 - end - - @testset "orthogonal" begin - # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. - for (rows,cols) in [(5,3),(3,5)] - v = orthogonal(rows, cols) - rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) - end - for mat in [(3,4,5),(2,2,5)] - v = orthogonal(mat...) - cols = mat[end] - rows = div(prod(mat),cols) - v = reshape(v, (rows,cols)) - rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) - end - @test eltype(orthogonal(3, 4; gain=1.5)) == Float32 - end - - @testset "sparse_init" begin - # sparse_init should yield an error for non 2-d dimensions - # sparse_init should yield no zero elements if sparsity < 0 - # sparse_init should yield all zero elements if sparsity > 1 - # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for other sparsity values - # sparse_init should yield a kernel in its non-zero elements consistent with the std parameter - - @test_throws ArgumentError sparse_init(100, 100, 100, sparsity=0.1) - v = sparse_init(100, 100, sparsity=-0.1) - @test sum(v .== 0) == 0 - v = sparse_init(100, 100, sparsity=1.1) - @test sum(v .== 0) == length(v) - - for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] - expected_zeros = ceil(Integer, n_in * sparsity) - v = sparse_init(n_in, n_out, sparsity=sparsity, std=σ) - @test all([sum(v[:,col] .== 0) == expected_zeros for col in 1:n_out]) - @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + if init == sparse_init + init = (args...) -> sparse_init(args...; sparsity = 0.5) + else + # sparse_init is the only one which accepts only matrices: + @test size(init(3)) == (3,) + @test size(init(3, 4, 5)) == (3, 4, 5) + end + @test size(init(3, 4)) == (3, 4) + # only init(size...) is accepted: + @test_throws MethodError size(init((3, 4, 5)))==(3, 4, 5) + + # rng, and currying: + @test size(init(MersenneTwister(1), 3, 4)) == (3, 4) + closure = init(MersenneTwister(1)) + @test size(closure(3, 4)) == (3, 4) + + # eltype, default Float32 + @test eltype(init(3, 4)) == Float32 + + # @non_differentiable + @test gradient(x -> sum(x .* init(3, 4)), 5.0)[1] isa Number + end + + @testset "glorot: $init" for init in [glorot_uniform, glorot_normal] + # glorot_uniform and glorot_normal should both yield a kernel with + # variance ≈ 2/(fan_in + fan_out) + for dims in [(1000,), (100, 100), (100, 400), (2, 3, 32, 64), (2, 3, 4, 32, 64)] + v = init(dims...) + fan_in, fan_out = nfan(dims...) + σ2 = 2 / (fan_in + fan_out) + @test 0.9σ2 < var(v) < 1.1σ2 + end + @test eltype(init(3, 4; gain = 1.5)) == Float32 + end + + @testset "kaiming" begin + # kaiming_uniform should yield a kernel in range [-sqrt(6/n_out), sqrt(6/n_out)] + # and kaiming_normal should yield a kernel with stddev ~= sqrt(2/n_out) + for (n_in, n_out) in [(100, 100), (100, 400)] + v = kaiming_uniform(n_in, n_out) + σ2 = sqrt(6 / n_out) + @test -1σ2 < minimum(v) < -0.9σ2 + @test 0.9σ2 < maximum(v) < 1σ2 + + v = kaiming_normal(n_in, n_out) + σ2 = sqrt(2 / n_out) + @test 0.9σ2 < std(v) < 1.1σ2 + end + @test eltype(kaiming_uniform(3, 4; gain = 1.5)) == Float32 + @test eltype(kaiming_normal(3, 4; gain = 1.5)) == Float32 + end + + @testset "orthogonal" begin + # A matrix of dim = (m,n) with m > n should produce a QR decomposition. In the other case, the transpose should be taken to compute the QR decomposition. + for (rows, cols) in [(5, 3), (3, 5)] + v = orthogonal(rows, cols) + rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + for mat in [(3, 4, 5), (2, 2, 5)] + v = orthogonal(mat...) + cols = mat[end] + rows = div(prod(mat), cols) + v = reshape(v, (rows, cols)) + rows < cols ? (@test v * v' ≈ I(rows)) : (@test v' * v ≈ I(cols)) + end + @test eltype(orthogonal(3, 4; gain = 1.5)) == Float32 + end + + @testset "sparse_init" begin + # sparse_init should yield an error for non 2-d dimensions + # sparse_init should yield no zero elements if sparsity < 0 + # sparse_init should yield all zero elements if sparsity > 1 + # sparse_init should yield exactly ceil(n_in * sparsity) elements in each column for other sparsity values + # sparse_init should yield a kernel in its non-zero elements consistent with the std parameter + + @test_throws ArgumentError sparse_init(100, 100, 100, sparsity = 0.1) + v = sparse_init(100, 100, sparsity = -0.1) + @test sum(v .== 0) == 0 + v = sparse_init(100, 100, sparsity = 1.1) + @test sum(v .== 0) == length(v) + + for (n_in, n_out, sparsity, σ) in [(100, 100, 0.25, 0.1), (100, 400, 0.75, 0.01)] + expected_zeros = ceil(Integer, n_in * sparsity) + v = sparse_init(n_in, n_out, sparsity = sparsity, std = σ) + @test all([sum(v[:, col] .== 0) == expected_zeros for col in 1:n_out]) + @test 0.9 * σ < std(v[v .!= 0]) < 1.1 * σ + end + + @test eltype(sparse_init(3, 4; std = 1.5, sparsity = 0.5)) == Float32 + end + + @testset "truncated_normal" begin + m = truncated_normal(100, 100) + @test minimum(m)≈-2 atol=0.05 # default arguments + @test maximum(m)≈2 atol=0.05 + @test mean(m)≈0 atol=0.1 + + size100 = (100, 100, 100) + for (μ, σ, lo, hi) in [(0.0, 1, -2, 3), (1, 2, -4.0, 5.0)] + v = truncated_normal(size100...; mean = μ, std = σ, lo, hi) + @test isapprox(mean(v), μ; atol = 1.0f-1) + @test isapprox(minimum(v), lo; atol = 1.0f-2) + @test isapprox(maximum(v), hi; atol = 1.0f-2) + @test eltype(v) == Float32 # despite some Float64 arguments + end + for (μ, σ, lo, hi) in [(6, 2, -100.0, 100), (-7.0, 10, -100, 100)] + v = truncated_normal(size100...; mean = μ, std = σ, lo, hi) + @test isapprox(mean(v), μ; atol = 1.0f-1) + @test isapprox(std(v), σ; atol = 1.0f-1) + end + end + + @testset "Partial application" begin + partial_ku = kaiming_uniform(gain = 1e9) + @test maximum(partial_ku(8, 8)) > 1e9 / 2 + @test maximum(partial_ku(8, 8, gain = 1)) < 1e9 / 2 + + partial_kn = kaiming_normal(gain = 1e9) + @test maximum(partial_kn(8, 8)) > 1e9 / 2 + @test maximum(partial_kn(8, 8, gain = 1)) < 1e9 / 2 + + partial_si = sparse_init(sparsity = 1) + @test maximum(partial_si(8, 8)) == 0 + @test maximum(partial_si(8, 8, sparsity = 0)) > 0 + end + + @testset "identity_init" begin + @testset "Basic" begin + partial = identity_init(gain = 3) + @test partial(3, 3) == identity_init(3, 3; gain = 3) == [3 0 0; 0 3 0; 0 0 3] + @test eltype(identity_init(3, 4; gain = 1.5)) == Float32 # despite Float64 keyword + end + @testset "Non-identity sizes" begin + @test identity_init(2, 3)[:, end] == zeros(Float32, 2) + @test identity_init(3, 2; shift = 1)[1, :] == zeros(Float32, 2) + @test identity_init(1, 1, 3, 4)[:, :, :, end] == zeros(Float32, 1, 1, 3) + @test identity_init(2, 1, 3, 3)[end, :, :, :] == zeros(Float32, 1, 3, 3) + @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) + end + @testset "Dense ID mapping" begin + l = Dense(3, 3, init = identity_init) + + indata = reshape(collect(Float32, 1:9), 3, 3) + @test l(indata) == indata + end + @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (Conv, + ConvTranspose, + CrossCor), + kernelsize in ((1,), + (3,), + (1, 3), + (3, 5), + (3, 5, + 7)) + + nch = 3 + l = layer(kernelsize, nch => nch, init = identity_init, pad = SamePad()) + + indata = randn(Float32, kernelsize..., nch, nch) + @test l(indata) == indata + end + @testset "Inception identity" begin + insize = 7 + path1 = Conv((1, 3), insize => 2; init = identity_init, pad = SamePad()) + path2 = Conv((3, 5), insize => 3; init = identity_init(shift = (0, 0, 2, 0)), + pad = SamePad()) + path3 = Conv((5, 7), insize => 2; init = identity_init(shift = (0, 0, 5, 0)), + pad = SamePad()) + block = Parallel((xs...) -> cat(xs...; dims = 3), path1, path2, path3) + + indata = randn(Float32, 9, 9, 7, 2) + @test block(indata) == indata + end end - - @test eltype(sparse_init(3, 4; std=1.5, sparsity=0.5)) == Float32 - end - - @testset "truncated_normal" begin - m = truncated_normal(100, 100) - @test minimum(m) ≈ -2 atol = 0.05 # default arguments - @test maximum(m) ≈ 2 atol = 0.05 - @test mean(m) ≈ 0 atol = 0.1 - - size100 = (100, 100, 100) - for (μ, σ, lo, hi) in [(0.0, 1, -2, 3), (1, 2, -4.0, 5.0)] - v = truncated_normal(size100...; mean = μ, std = σ, lo, hi) - @test isapprox(mean(v), μ; atol = 1f-1) - @test isapprox(minimum(v), lo; atol = 1f-2) - @test isapprox(maximum(v), hi; atol = 1f-2) - @test eltype(v) == Float32 # despite some Float64 arguments - end - for (μ, σ, lo, hi) in [(6, 2, -100.0, 100), (-7.0, 10, -100, 100)] - v = truncated_normal(size100...; mean = μ, std = σ, lo, hi) - @test isapprox(mean(v), μ; atol = 1f-1) - @test isapprox(std(v), σ; atol = 1f-1) - end - end - - @testset "Partial application" begin - partial_ku = kaiming_uniform(gain=1e9) - @test maximum(partial_ku(8, 8)) > 1e9 / 2 - @test maximum(partial_ku(8, 8, gain=1)) < 1e9 / 2 - - partial_kn = kaiming_normal(gain=1e9) - @test maximum(partial_kn(8, 8)) > 1e9 / 2 - @test maximum(partial_kn(8, 8, gain=1)) < 1e9 / 2 - - partial_si = sparse_init(sparsity=1) - @test maximum(partial_si(8, 8)) == 0 - @test maximum(partial_si(8, 8, sparsity=0)) > 0 - end - - @testset "identity_init" begin - @testset "Basic" begin - partial = identity_init(gain=3) - @test partial(3, 3) == identity_init(3, 3; gain=3) == [3 0 0; 0 3 0; 0 0 3] - @test eltype(identity_init(3, 4; gain=1.5)) == Float32 # despite Float64 keyword - end - @testset "Non-identity sizes" begin - @test identity_init(2, 3)[:, end] == zeros(Float32, 2) - @test identity_init(3, 2; shift=1)[1, :] == zeros(Float32, 2) - @test identity_init(1, 1, 3, 4)[:, :, :, end] == zeros(Float32, 1, 1, 3) - @test identity_init(2, 1, 3, 3)[end, :, :, :] == zeros(Float32, 1, 3, 3) - @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) - end - @testset "Dense ID mapping" begin - l = Dense(3,3, init = identity_init) - - indata = reshape(collect(Float32, 1:9), 3, 3) - @test l(indata) == indata - end - @testset "$layer ID mapping with kernelsize $kernelsize" for layer in (Conv, ConvTranspose, CrossCor), kernelsize in ( - (1,), - (3,), - (1, 3), - (3, 5), - (3, 5, 7)) - nch = 3 - l = layer(kernelsize, nch=>nch, init=identity_init, pad=SamePad()) - - indata = randn(Float32, kernelsize..., nch, nch) - @test l(indata) == indata - end - @testset "Inception identity" begin - insize = 7 - path1 = Conv((1, 3), insize=>2; init=identity_init, pad=SamePad()) - path2 = Conv((3, 5), insize=>3; init=identity_init(shift=(0, 0, 2, 0)), pad=SamePad()) - path3 = Conv((5, 7), insize=>2; init=identity_init(shift=(0, 0, 5, 0)), pad=SamePad()) - block = Parallel((xs...) -> cat(xs...;dims=3), path1, path2, path3) - - indata = randn(Float32, 9, 9, 7, 2) - @test block(indata) == indata - end - end end @testset "Params" begin - m = Dense(10, 5) - @test size.(params(m)) == [(5, 10), (5,)] - m = RNN(10, 5) - @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5, 1)] - - # Layer duplicated in same chain, params just once pls. - c = Chain(m, m) - @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5, 1)] - - # Self-referential array. Just want params, no stack overflow pls. - r = Any[nothing,m] - r[1] = r - @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5, 1)] - - # Ensure functor explores inside Transpose but not SubArray - m = (x = view([1,2,3]pi, 1:2), y = transpose([4 5]pi)) - @test size.(Flux.params(m)) == [(2,), (1, 2)] + m = Dense(10, 5) + @test size.(params(m)) == [(5, 10), (5,)] + m = RNN(10, 5) + @test size.(params(m)) == [(5, 10), (5, 5), (5,), (5, 1)] + + # Layer duplicated in same chain, params just once pls. + c = Chain(m, m) + @test size.(params(c)) == [(5, 10), (5, 5), (5,), (5, 1)] + + # Self-referential array. Just want params, no stack overflow pls. + r = Any[nothing, m] + r[1] = r + @test size.(params(r)) == [(5, 10), (5, 5), (5,), (5, 1)] + + # Ensure functor explores inside Transpose but not SubArray + m = (x = view([1, 2, 3]pi, 1:2), y = transpose([4 5]pi)) + @test size.(Flux.params(m)) == [(2,), (1, 2)] end @testset "Precision" begin - m = Chain(Dense(10, 5, relu), Dense(5, 2)) - x64 = rand(Float64, 10) - x32 = rand(Float32, 10) - @test eltype(m[1].weight) == Float32 - @test eltype(m(x32)) == Float32 - @test eltype(m(x64)) == Float64 - @test eltype(f64(m)(x32)) == Float64 - @test eltype(f64(m)(x64)) == Float64 - @test eltype(f64(m)[1].weight) == Float64 - @test eltype(f32(f64(m))[1].weight) == Float32 + m = Chain(Dense(10, 5, relu), Dense(5, 2)) + x64 = rand(Float64, 10) + x32 = rand(Float32, 10) + @test eltype(m[1].weight) == Float32 + @test eltype(m(x32)) == Float32 + @test eltype(m(x64)) == Float64 + @test eltype(f64(m)(x32)) == Float64 + @test eltype(f64(m)(x64)) == Float64 + @test eltype(f64(m)[1].weight) == Float64 + @test eltype(f32(f64(m))[1].weight) == Float32 end @testset "zero bias" begin - m = Dense(3 => 2; bias=false) - @test f64(m).bias === m.bias === false - @test f32(m).bias === m.bias === false + m = Dense(3 => 2; bias = false) + @test f64(m).bias === m.bias === false + @test f32(m).bias === m.bias === false - @testset "Gradients for broadcasted $op with sizes $s" for op in (+,-,*), s in ((1,), (2,3)) - o = ones(s) - z = zeros(s) + @testset "Gradients for broadcasted $op with sizes $s" for op in (+, -, *), + s in ((1,), (2, 3)) - @testset "Explicit" begin - gfun(args...) = gradient((x, y) -> sum(op.(x,y)), args...) - g = gfun(o, z) - @test gfun(o, false) == (g[1], nothing) + o = ones(s) + z = zeros(s) - g = gfun(z, o) - @test gfun(false, o) == (nothing, g[2]) - end + @testset "Explicit" begin + gfun(args...) = gradient((x, y) -> sum(op.(x, y)), args...) + g = gfun(o, z) + @test gfun(o, false) == (g[1], nothing) + + g = gfun(z, o) + @test gfun(false, o) == (nothing, g[2]) + end - @testset "Implicit" begin - gfun(args...) = gradient(() -> sum(op.(args...)), params(collect(args))) - g = gfun(o, z) + @testset "Implicit" begin + gfun(args...) = gradient(() -> sum(op.(args...)), params(collect(args))) + g = gfun(o, z) - gres = gfun(o, false) - @test gres[o] == g[o] - @test false ∉ gres.params + gres = gfun(o, false) + @test gres[o] == g[o] + @test false ∉ gres.params - g = gfun(z, o) - gres = gfun(false, o) - @test gres[o] == g[o] - @test false ∉ gres.params + g = gfun(z, o) + gres = gfun(false, o) + @test gres[o] == g[o] + @test false ∉ gres.params + end end - end end @testset "unsqueeze" begin - x = randn(2, 3, 2) - @test @inferred(unsqueeze(x, dims=1)) == reshape(x, 1, 2, 3, 2) - @test @inferred(unsqueeze(x, dims=2)) == reshape(x, 2, 1, 3, 2) - @test @inferred(unsqueeze(x, dims=3)) == reshape(x, 2, 3, 1, 2) - @test @inferred(unsqueeze(x, dims=4)) == reshape(x, 2, 3, 2, 1) + x = randn(2, 3, 2) + @test @inferred(unsqueeze(x, dims = 1)) == reshape(x, 1, 2, 3, 2) + @test @inferred(unsqueeze(x, dims = 2)) == reshape(x, 2, 1, 3, 2) + @test @inferred(unsqueeze(x, dims = 3)) == reshape(x, 2, 3, 1, 2) + @test @inferred(unsqueeze(x, dims = 4)) == reshape(x, 2, 3, 2, 1) end @testset "Stacking" begin - x = randn(3,3) - stacked = MLUtils.stack([x, x], dims=2) - @test size(stacked) == (3,2,3) - - stacked_array=[ 8 9 3 5; 9 6 6 9; 9 1 7 2; 7 4 10 6 ] - unstacked_array=[[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] - @test unstack(stacked_array, dims=2) == unstacked_array - @test MLUtils.stack(unstacked_array, dims=2) == stacked_array - @test MLUtils.stack(unstack(stacked_array, dims=1), dims=1) == stacked_array + x = randn(3, 3) + stacked = MLUtils.stack([x, x], dims = 2) + @test size(stacked) == (3, 2, 3) + + stacked_array = [8 9 3 5; 9 6 6 9; 9 1 7 2; 7 4 10 6] + unstacked_array = [[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] + @test unstack(stacked_array, dims = 2) == unstacked_array + @test MLUtils.stack(unstacked_array, dims = 2) == stacked_array + @test MLUtils.stack(unstack(stacked_array, dims = 1), dims = 1) == stacked_array end @testset "Batching" begin - stacked_array=[ 8 9 3 5 - 9 6 6 9 - 9 1 7 2 - 7 4 10 6 ] - unstacked_array=[[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] - @test unbatch(stacked_array) == unstacked_array - @test batch(unstacked_array) == stacked_array - - # no-op for vector of non-arrays - @test batch([1,2,3]) == [1,2,3] - @test unbatch([1,2,3]) == [1,2,3] - - # generic iterable - @test batch(ones(2) for i=1:3) == ones(2, 3) - @test unbatch(ones(2, 3)) == [ones(2) for i=1:3] + stacked_array = [8 9 3 5 + 9 6 6 9 + 9 1 7 2 + 7 4 10 6] + unstacked_array = [[8, 9, 9, 7], [9, 6, 1, 4], [3, 6, 7, 10], [5, 9, 2, 6]] + @test unbatch(stacked_array) == unstacked_array + @test batch(unstacked_array) == stacked_array + + # no-op for vector of non-arrays + @test batch([1, 2, 3]) == [1, 2, 3] + @test unbatch([1, 2, 3]) == [1, 2, 3] + + # generic iterable + @test batch(ones(2) for i in 1:3) == ones(2, 3) + @test unbatch(ones(2, 3)) == [ones(2) for i in 1:3] end @testset "Param remapping" begin - ls(dims...) = reshape(collect(Float32, 1:prod(dims)), dims...) # accepts dims in reverse order to Dense - dl(nin, nout, bias) = Dense(ls(nout, nin), bias(nout)) - dm(bias) = Chain( - dl(3, 5, bias), - dl(5, 4, bias), - dl(4, 3, bias) - ) - - nobias(n) = false - testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, dm(bt))) - @test l1.weight == l2.weight - @test l1.bias == l2.bias - @test_skip typeof(l1.bias) === typeof(l2.bias) - end - - @testset "loadparams!" begin - pars(w, b) = [w, b] - pars(l) = pars(l.weight, l.bias) - pararray(m) = mapreduce(pars, vcat, m) - weights(m) = mapreduce(l -> [l.weight], vcat, m) - @testset "Bias type $bt" for bt in (Flux.zeros32, nobias) - m = dm(bt) - Flux.loadparams!(m, params(m)) - testdense(m, bt) - end - end - - @testset "loadmodel!(dst, src)" begin - m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) - m2 = Chain(Dense(10, 5), Dense(5, 2)) - m3 = Chain(Conv((3, 3), 3 => 16), Dense(5, 2)) - m4 = Chain(Dense(10, 6), Dense(6, 2)) - m5 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5, 2))) - m6 = Chain(Dense(10, 5), Parallel(+, Dense(5, 2), Dense(5, 2))) - - loadmodel!(m1, m2) - # trainable parameters copy over - @test m1[1].weight == m2[1].weight - @test m1[1].bias == m2[1].bias - # non-array leaves are untouched - @test m1[2].σ == relu - - loadmodel!(m5, m6) - # more complex nested structures also work - @test m5[1].weight == m6[1].weight - @test m5[2][1].weight == m6[2][1].weight - # false bias is not overwritten - @test m5[2][1].bias == false - - # mismatched nodes throw an error - @test_throws ArgumentError loadmodel!(m1, m3) - @test_throws ArgumentError loadmodel!(m1, m5) - # size mismatches throw an error - @test_throws DimensionMismatch loadmodel!(m1, m4) - - # tests for BatchNorm and Dropout - m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2)) - m2 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), x -> reshape(x, :, size(x)[end]), Dropout(0.1)) - m2[2].μ .= rand(Float32, size(m2[2].μ)...) - loadmodel!(m1, m2) - # non-trainable parameters are copied as well - @test m1[2].μ == m2[2].μ - # functions are not copied - @test m1[3] == Flux.flatten - # dropout rate is not copied - @test m1[4].p == 0.2 - - # from LegolasFlux (https://github.com/beacon-biosignals/LegolasFlux.jl/blob/80569ab63a8248a8a063c76e0bbf701f4ada9bd4/examples/digits.jl#L33) - # tests Chain(...) vs Chain([...]) - # tests MaxPool - # tests testmode!/trainmode! is not copied - # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model - chain1 = Chain(Dropout(0.2), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 32 => 16, relu), - Dropout(0.2), - MaxPool((2, 2)), - Dropout(0.2), - Conv((3, 3), 16 => 10, relu), - Dropout(0.2), - x -> reshape(x, :, size(x, 4)), - Dropout(0.2), - Dense(90, 10), - softmax) - chain2 = Chain([Dropout(0.1), - Conv((3, 3), 1 => 32, relu), - BatchNorm(32, relu), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 32 => 16, relu), - Dropout(0.1), - MaxPool((3, 3)), - Dropout(0.1), - Conv((3, 3), 16 => 10, relu), - Dropout(0.1), - x -> reshape(x, :, size(x, 4)), - Dropout(0.1), - Dense(90, 10), - softmax]) - chain2[3].μ .= 5f0 - chain2[3].σ² .= 2f0 - testmode!(chain2) - loadmodel!(chain1, chain2) - for (dst, src) in zip(chain1, chain2) - if dst isa Dropout - @test dst.p == 0.2 - elseif dst isa Union{Conv, Dense} - @test dst.weight == src.weight - @test dst.bias == src.bias - elseif dst isa MaxPool - @test dst.k == (2, 2) - elseif dst isa BatchNorm - @test dst.μ == src.μ - @test dst.σ² == src.σ² - @test isnothing(dst.active) - end - end - - # copy only a subset of the model - chain1[end - 1].weight .= 1f0 - chain1[3].μ .= 3f0 - chain1[2].bias .= 5f0 - loadmodel!(chain2[end - 1], chain1[end - 1]) - loadmodel!(chain2[3], chain1[3]) - @test chain2[end - 1].weight == chain1[end - 1].weight - @test chain2[3].μ == chain1[3].μ - @test chain2[2].bias != chain1[2].bias - - # test shared weights - shared_dst = Dense(10 => 10) - shared_src = Dense(10 => 10) - # matched weights are okay - m1 = Chain(shared_dst, Dense(shared_dst.weight)) - m2 = Chain(shared_src, Dense(shared_src.weight)) - loadmodel!(m1, m2) - @test m1[1].weight === m1[2].weight - @test m1[1].weight == m2[2].weight - # mismatched weights are an error - m2 = Chain(Dense(10 => 10), Dense(10 => 10)) - @test_throws ErrorException loadmodel!(m1, m2) - # loading into tied weights with absent parameter is okay when the dst == zero - b = Flux.zeros32(5) - m1 = Chain(Dense(10 => 5; bias = b), Dense(5 => 5; bias = b)) - m2 = Chain(Dense(10 => 5; bias = Flux.zeros32(5)), Dense(5 => 5; bias = false)) - loadmodel!(m1, m2) - @test m1[1].bias === m1[2].bias - @test iszero(m1[1].bias) - # loading into tied weights with absent parameter is bad when the dst != zero - m2[1].bias .= 1 - @test_throws ErrorException loadmodel!(m1, m2) - - @testset "loadmodel! & filter" begin - m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) - m2 = Chain(Dense(10, 5), Dropout(0.2), Dense(5, 2)) - m3 = Chain(Dense(10, 5), Dense(5, 2, relu)) - - # this will not error cause Dropout is skipped - loadmodel!(m1, m2; filter = x -> !(x isa Dropout)) - @test m1[1].weight == m2[1].weight - @test m1[2].weight == m2[3].weight - - # this will not error cause Dropout is skipped - loadmodel!(m2, m3; filter = x -> !(x isa Dropout)) - @test m3[1].weight == m2[1].weight - @test m3[2].weight == m2[3].weight - end - - @testset "loadmodel! & absent bias" begin - m0 = Chain(Dense(2 => 3; bias=false, init = Flux.ones32), Dense(3 => 1)) - m1 = Chain(Dense(2 => 3; bias = Flux.randn32(3)), Dense(3 => 1)) - m2 = Chain(Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]), Dense(3 => 1)) - - Flux.loadmodel!(m1, m2) - @test m1[1].bias == 7:9 - @test sum(m1[1].weight) == 21 - - # load from a model without bias -- should ideally recognise the `false` but `Params` doesn't store it - m1 = Flux.loadmodel!(m1, m0) - @test iszero(m1[1].bias) - @test sum(m1[1].weight) == 6 # written before error - - # load into a model without bias -- should it ignore the parameter which has no home, or error? - m0 = Flux.loadmodel!(m0, m2) - @test iszero(m0[1].bias) # obviously unchanged - @test sum(m0[1].weight) == 21 - end - end - - @testset "destructure" begin - import Flux: destructure - @testset "Bias type $bt" for bt in (zeros, nobias) - m = dm(bt) - p, re = destructure(m) - testdense(re(p), bt) + ls(dims...) = reshape(collect(Float32, 1:prod(dims)), dims...) # accepts dims in reverse order to Dense + dl(nin, nout, bias) = Dense(ls(nout, nin), bias(nout)) + dm(bias) = Chain(dl(3, 5, bias), + dl(5, 4, bias), + dl(4, 3, bias)) + + nobias(n) = false + testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, + dm(bt))) + @test l1.weight == l2.weight + @test l1.bias == l2.bias + @test_skip typeof(l1.bias) === typeof(l2.bias) + end + + @testset "loadparams!" begin + pars(w, b) = [w, b] + pars(l) = pars(l.weight, l.bias) + pararray(m) = mapreduce(pars, vcat, m) + weights(m) = mapreduce(l -> [l.weight], vcat, m) + @testset "Bias type $bt" for bt in (Flux.zeros32, nobias) + m = dm(bt) + Flux.loadparams!(m, params(m)) + testdense(m, bt) + end + end + + @testset "loadmodel!(dst, src)" begin + m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) + m2 = Chain(Dense(10, 5), Dense(5, 2)) + m3 = Chain(Conv((3, 3), 3 => 16), Dense(5, 2)) + m4 = Chain(Dense(10, 6), Dense(6, 2)) + m5 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5, 2))) + m6 = Chain(Dense(10, 5), Parallel(+, Dense(5, 2), Dense(5, 2))) + + loadmodel!(m1, m2) + # trainable parameters copy over + @test m1[1].weight == m2[1].weight + @test m1[1].bias == m2[1].bias + # non-array leaves are untouched + @test m1[2].σ == relu + + loadmodel!(m5, m6) + # more complex nested structures also work + @test m5[1].weight == m6[1].weight + @test m5[2][1].weight == m6[2][1].weight + # false bias is not overwritten + @test m5[2][1].bias == false + + # mismatched nodes throw an error + @test_throws ArgumentError loadmodel!(m1, m3) + @test_throws ArgumentError loadmodel!(m1, m5) + # size mismatches throw an error + @test_throws DimensionMismatch loadmodel!(m1, m4) + + # tests for BatchNorm and Dropout + m1 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), Flux.flatten, Dropout(0.2)) + m2 = Chain(Conv((3, 3), 3 => 16), BatchNorm(16), x -> reshape(x, :, size(x)[end]), + Dropout(0.1)) + m2[2].μ .= rand(Float32, size(m2[2].μ)...) + loadmodel!(m1, m2) + # non-trainable parameters are copied as well + @test m1[2].μ == m2[2].μ + # functions are not copied + @test m1[3] == Flux.flatten + # dropout rate is not copied + @test m1[4].p == 0.2 + + # from LegolasFlux (https://github.com/beacon-biosignals/LegolasFlux.jl/blob/80569ab63a8248a8a063c76e0bbf701f4ada9bd4/examples/digits.jl#L33) + # tests Chain(...) vs Chain([...]) + # tests MaxPool + # tests testmode!/trainmode! is not copied + # tests Dense, Conv, BatchNorm, Dropout (like above) but in a bigger model + chain1 = Chain(Dropout(0.2), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 32 => 16, relu), + Dropout(0.2), + MaxPool((2, 2)), + Dropout(0.2), + Conv((3, 3), 16 => 10, relu), + Dropout(0.2), + x -> reshape(x, :, size(x, 4)), + Dropout(0.2), + Dense(90, 10), + softmax) + chain2 = Chain([Dropout(0.1), + Conv((3, 3), 1 => 32, relu), + BatchNorm(32, relu), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 32 => 16, relu), + Dropout(0.1), + MaxPool((3, 3)), + Dropout(0.1), + Conv((3, 3), 16 => 10, relu), + Dropout(0.1), + x -> reshape(x, :, size(x, 4)), + Dropout(0.1), + Dense(90, 10), + softmax]) + chain2[3].μ .= 5.0f0 + chain2[3].σ² .= 2.0f0 + testmode!(chain2) + loadmodel!(chain1, chain2) + for (dst, src) in zip(chain1, chain2) + if dst isa Dropout + @test dst.p == 0.2 + elseif dst isa Union{Conv, Dense} + @test dst.weight == src.weight + @test dst.bias == src.bias + elseif dst isa MaxPool + @test dst.k == (2, 2) + elseif dst isa BatchNorm + @test dst.μ == src.μ + @test dst.σ² == src.σ² + @test isnothing(dst.active) + end + end + + # copy only a subset of the model + chain1[end - 1].weight .= 1.0f0 + chain1[3].μ .= 3.0f0 + chain1[2].bias .= 5.0f0 + loadmodel!(chain2[end - 1], chain1[end - 1]) + loadmodel!(chain2[3], chain1[3]) + @test chain2[end - 1].weight == chain1[end - 1].weight + @test chain2[3].μ == chain1[3].μ + @test chain2[2].bias != chain1[2].bias + + # test shared weights + shared_dst = Dense(10 => 10) + shared_src = Dense(10 => 10) + # matched weights are okay + m1 = Chain(shared_dst, Dense(shared_dst.weight)) + m2 = Chain(shared_src, Dense(shared_src.weight)) + loadmodel!(m1, m2) + @test m1[1].weight === m1[2].weight + @test m1[1].weight == m2[2].weight + # mismatched weights are an error + m2 = Chain(Dense(10 => 10), Dense(10 => 10)) + @test_throws ErrorException loadmodel!(m1, m2) + # loading into tied weights with absent parameter is okay when the dst == zero + b = Flux.zeros32(5) + m1 = Chain(Dense(10 => 5; bias = b), Dense(5 => 5; bias = b)) + m2 = Chain(Dense(10 => 5; bias = Flux.zeros32(5)), Dense(5 => 5; bias = false)) + loadmodel!(m1, m2) + @test m1[1].bias === m1[2].bias + @test iszero(m1[1].bias) + # loading into tied weights with absent parameter is bad when the dst != zero + m2[1].bias .= 1 + @test_throws ErrorException loadmodel!(m1, m2) + + @testset "loadmodel! & filter" begin + m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) + m2 = Chain(Dense(10, 5), Dropout(0.2), Dense(5, 2)) + m3 = Chain(Dense(10, 5), Dense(5, 2, relu)) + + # this will not error cause Dropout is skipped + loadmodel!(m1, m2; filter = x -> !(x isa Dropout)) + @test m1[1].weight == m2[1].weight + @test m1[2].weight == m2[3].weight + + # this will not error cause Dropout is skipped + loadmodel!(m2, m3; filter = x -> !(x isa Dropout)) + @test m3[1].weight == m2[1].weight + @test m3[2].weight == m2[3].weight + end + + @testset "loadmodel! & absent bias" begin + m0 = Chain(Dense(2 => 3; bias = false, init = Flux.ones32), Dense(3 => 1)) + m1 = Chain(Dense(2 => 3; bias = Flux.randn32(3)), Dense(3 => 1)) + m2 = Chain(Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]), Dense(3 => 1)) + + Flux.loadmodel!(m1, m2) + @test m1[1].bias == 7:9 + @test sum(m1[1].weight) == 21 + + # load from a model without bias -- should ideally recognise the `false` but `Params` doesn't store it + m1 = Flux.loadmodel!(m1, m0) + @test iszero(m1[1].bias) + @test sum(m1[1].weight) == 6 # written before error + + # load into a model without bias -- should it ignore the parameter which has no home, or error? + m0 = Flux.loadmodel!(m0, m2) + @test iszero(m0[1].bias) # obviously unchanged + @test sum(m0[1].weight) == 21 + end + end + + @testset "destructure" begin + import Flux: destructure + @testset "Bias type $bt" for bt in (zeros, nobias) + m = dm(bt) + p, re = destructure(m) + testdense(re(p), bt) + end + + @testset "restructure in gradient" begin + x = rand(Float32, 3, 1) + m = dm(zeros) + ∇m = gradient(m -> sum(m(x)), m)[1] + p, re = destructure(m) + ∇p = gradient(θ -> sum(re(θ)(x)), p)[1] + @test ∇p ≈ destructure(∇m)[1] + end end - - @testset "restructure in gradient" begin - x = rand(Float32, 3, 1) - m = dm(zeros) - ∇m = gradient(m -> sum(m(x)), m)[1] - p, re = destructure(m) - ∇p = gradient(θ -> sum(re(θ)(x)), p)[1] - @test ∇p ≈ destructure(∇m)[1] - end - end end @testset "Train and test mode" begin - mutable struct DummyLayer - testing::Bool - end - Flux.testmode!(m::DummyLayer, testing=true) = (m.testing = testing; m) - - c = Chain(DummyLayer(true)) - testmode!(c) - @test c[1].testing - trainmode!(c) - @test !c[1].testing + mutable struct DummyLayer + testing::Bool + end + Flux.testmode!(m::DummyLayer, testing = true) = (m.testing = testing; m) + + c = Chain(DummyLayer(true)) + testmode!(c) + @test c[1].testing + trainmode!(c) + @test !c[1].testing end @testset "modules" begin - m1 = Conv((2,3), 4=>5; pad=6, stride=7) - m2 = LayerNorm(8) - m3 = m2.diag - m4 = SkipConnection(m1, +) - m5 = Chain(m4, m2) - modules = Flux.modules(m5) - # Depth-first descent - @test length(modules) == 6 - @test modules[1] === m5 - @test modules[3] === m4 - @test modules[4] === m1 - @test modules[5] === m2 - @test modules[6] === m3 - - mod_par = Flux.modules(Parallel(Flux.Bilinear(2,2,2,cbrt), Dense(2,2,abs), Dense(2,2,abs2))) - @test length(mod_par) == 5 - - mod_rnn = Flux.modules(Chain(Dense(2,3), BatchNorm(3), LSTM(3,4))) - @test length(mod_rnn) == 6 - @test mod_rnn[end] isa Flux.LSTMCell - - mod_skip = Flux.modules(Chain(SkipConnection( - Conv((2,3), 4=>5; pad=6, stride=7), - +), - LayerNorm(8))) - @test length(mod_skip) == 6 - @test mod_skip[end] isa Flux.Scale + m1 = Conv((2, 3), 4 => 5; pad = 6, stride = 7) + m2 = LayerNorm(8) + m3 = m2.diag + m4 = SkipConnection(m1, +) + m5 = Chain(m4, m2) + modules = Flux.modules(m5) + # Depth-first descent + @test length(modules) == 6 + @test modules[1] === m5 + @test modules[3] === m4 + @test modules[4] === m1 + @test modules[5] === m2 + @test modules[6] === m3 + + mod_par = Flux.modules(Parallel(Flux.Bilinear(2, 2, 2, cbrt), Dense(2, 2, abs), + Dense(2, 2, abs2))) + @test length(mod_par) == 5 + + mod_rnn = Flux.modules(Chain(Dense(2, 3), BatchNorm(3), LSTM(3, 4))) + @test length(mod_rnn) == 6 + @test mod_rnn[end] isa Flux.LSTMCell + + mod_skip = Flux.modules(Chain(SkipConnection(Conv((2, 3), 4 => 5; pad = 6, stride = 7), + +), + LayerNorm(8))) + @test length(mod_skip) == 6 + @test mod_skip[end] isa Flux.Scale end @testset "Patience triggers" begin - @testset "patience" begin - trigger = Flux.patience(() -> true, 3) + @testset "patience" begin + trigger = Flux.patience(() -> true, 3) - @test trigger() == false - @test trigger() == false - @test trigger() == true + @test trigger() == false + @test trigger() == false + @test trigger() == true - v = [false, true, false, true, true, true] - trigger = let v = v - Flux.patience(i -> v[i], 3) - end + v = [false, true, false, true, true, true] + trigger = let v = v + Flux.patience(i -> v[i], 3) + end - n_iter = 0 - for i in 1:length(v) - trigger(i) && break - n_iter += 1 - end + n_iter = 0 + for i in 1:length(v) + trigger(i) && break + n_iter += 1 + end - @test n_iter == 5 - end + @test n_iter == 5 + end - @testset "early stopping" begin - @testset "args & kwargs" begin - es = Flux.early_stopping((x; y = 1) -> x + y, 10; min_dist=3) + @testset "early stopping" begin + @testset "args & kwargs" begin + es = Flux.early_stopping((x; y = 1) -> x + y, 10; min_dist = 3) - n_iter = 0 - while n_iter < 99 - es(-n_iter; y=-n_iter) && break - n_iter += 1 - end + n_iter = 0 + while n_iter < 99 + es(-n_iter; y = -n_iter) && break + n_iter += 1 + end - @test n_iter == 9 - end + @test n_iter == 9 + end - @testset "distance" begin - es = Flux.early_stopping(identity, 10; distance=(best_score, score) -> score - best_score) + @testset "distance" begin + es = Flux.early_stopping(identity, 10; + distance = (best_score, score) -> score - best_score) - n_iter = 0 - while n_iter < 99 - es(n_iter) && break - n_iter += 1 - end + n_iter = 0 + while n_iter < 99 + es(n_iter) && break + n_iter += 1 + end - @test n_iter == 99 - end + @test n_iter == 99 + end - @testset "init_score" begin - es = Flux.early_stopping(identity, 10; init_score=10) + @testset "init_score" begin + es = Flux.early_stopping(identity, 10; init_score = 10) - n_iter = 0 - while n_iter < 99 - es(n_iter) && break - n_iter += 1 - end + n_iter = 0 + while n_iter < 99 + es(n_iter) && break + n_iter += 1 + end - @test n_iter == 10 + @test n_iter == 10 + end end - end - @testset "plateau" begin - f = let v = 10 - () -> v = v / abs(v) - v - end + @testset "plateau" begin + f = let v = 10 + () -> v = v / abs(v) - v + end - trigger = Flux.plateau(f, 3, init_score=10, min_dist=18) + trigger = Flux.plateau(f, 3, init_score = 10, min_dist = 18) - n_iter = 0 - while n_iter < 99 - trigger() && break - n_iter += 1 - end + n_iter = 0 + while n_iter < 99 + trigger() && break + n_iter += 1 + end - @test n_iter == 3 - end + @test n_iter == 3 + end end @testset "Shared parameters" begin - mat = [1 2; 3 4.0] - simple = ((nothing, mat, (3, mat, 4))) - @test length(Flux.params(simple)) == 1 - - oneadj = (nt = (m = mat, a = mat')) - @test length(Flux.params(oneadj)) == 1 # needs Functors@0.3 - - @test Flux.destructure(simple)[1] == Flux.destructure(oneadj)[1] == [1, 3, 2, 4] -end - -@testset "Various destructure bugs" begin + mat = [1 2; 3 4.0] + simple = ((nothing, mat, (3, mat, 4))) + @test length(Flux.params(simple)) == 1 - @testset "issue 1601" begin - struct TwoDenses - dense::Dense - dense2::Dense - end - Flux.@functor TwoDenses + oneadj = (nt = (m = mat, a = mat')) + @test length(Flux.params(oneadj)) == 1 # needs Functors@0.3 - function (m::TwoDenses)(x) - out = m.dense(x) - end + @test Flux.destructure(simple)[1] == Flux.destructure(oneadj)[1] == [1, 3, 2, 4] +end - model = TwoDenses( - Dense(3,1), - Dense(3,2) - ) - p, re = Flux.destructure(model) +@testset "Various destructure bugs" begin + @testset "issue 1601" begin + struct TwoDenses + dense::Dense + dense2::Dense + end + Flux.@functor TwoDenses - x = [1., 2., 3.] - y, back = Flux.Zygote.pullback((x, p) -> re(p)(x), x, p) + function (m::TwoDenses)(x) + return out = m.dense(x) + end - dy = [4.] - dx, dp = back(dy) - @test length(p) == length(dp) - end + model = TwoDenses(Dense(3, 1), + Dense(3, 2)) + p, re = Flux.destructure(model) - @testset "issue 1727" begin - p, re = Flux.destructure(BatchNorm(3)) # 6 parameters, plus 6 non-trainable - @test length(p) == 6 + x = [1.0, 2.0, 3.0] + y, back = Flux.Zygote.pullback((x, p) -> re(p)(x), x, p) - x = rand(Float32, 3, 4) - y, back = Flux.pullback(x, p) do x, p - vec(re(p)(x)) + dy = [4.0] + dx, dp = back(dy) + @test length(p) == length(dp) end - @test_nowarn back(y) - b = back(y) - @test size(b[1]) == size(x) - @test size(b[2]) == size(p) - end + @testset "issue 1727" begin + p, re = Flux.destructure(BatchNorm(3)) # 6 parameters, plus 6 non-trainable + @test length(p) == 6 + + x = rand(Float32, 3, 4) + y, back = Flux.pullback(x, p) do x, p + return vec(re(p)(x)) + end + @test_nowarn back(y) + b = back(y) - @testset "issue 1767" begin - struct Model{A} - a::A - b::A + @test size(b[1]) == size(x) + @test size(b[2]) == size(p) end - Flux.@functor Model - (m::Model)(x) = m.a(x) .+ m.b(x) - d = Dense(1, 1) - x = rand(Float32, 1, 1) + @testset "issue 1767" begin + struct Model{A} + a::A + b::A + end + Flux.@functor Model + (m::Model)(x) = m.a(x) .+ m.b(x) - # Sharing the parameters - model = Model(d, d) + d = Dense(1, 1) + x = rand(Float32, 1, 1) - # Works - g1 = Flux.gradient(() -> sum(model(x)), Flux.params(model)) + # Sharing the parameters + model = Model(d, d) - p, re = Flux.destructure(model) - # Fails - g2 = Flux.gradient(p -> sum(re(p)(x)), p) + # Works + g1 = Flux.gradient(() -> sum(model(x)), Flux.params(model)) - @test g2[1] ≈ vcat(g1[d.weight], g1[d.bias]) - end + p, re = Flux.destructure(model) + # Fails + g2 = Flux.gradient(p -> sum(re(p)(x)), p) - @testset "issue 1826" begin - struct Split{T} # taken from: https://fluxml.ai/Flux.jl/stable/models/advanced/#Multiple-outputs:-a-custom-Split-layer - paths::T + @test g2[1] ≈ vcat(g1[d.weight], g1[d.bias]) end - Split(paths...) = Split(paths) - Flux.@functor Split - (m::Split)(x::AbstractArray) = map(f -> f(x), m.paths) - n_input, n_batch, n_shared = 5, 13, 11 - n_outputs = [3, 7] + @testset "issue 1826" begin + struct Split{T} # taken from: https://fluxml.ai/Flux.jl/stable/models/advanced/#Multiple-outputs:-a-custom-Split-layer + paths::T + end + Split(paths...) = Split(paths) + Flux.@functor Split + (m::Split)(x::AbstractArray) = map(f -> f(x), m.paths) - data = rand(Float32, n_input, n_batch) - model = Chain( - Dense(n_input, n_shared), - Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2])) - ) + n_input, n_batch, n_shared = 5, 13, 11 + n_outputs = [3, 7] - pvec, re = Flux.destructure(model) - loss(x, idx, pv) = sum(abs2, re(pv)(x)[idx]) # loss wrt `idx`th output term + data = rand(Float32, n_input, n_batch) + model = Chain(Dense(n_input, n_shared), + Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2]))) - g = Flux.Zygote.ForwardDiff.gradient(pv -> loss(data, 1, pv), pvec) - @test g ≈ Flux.Zygote.gradient(pv -> loss(data, 1, pv), pvec)[1] - end -end + pvec, re = Flux.destructure(model) + loss(x, idx, pv) = sum(abs2, re(pv)(x)[idx]) # loss wrt `idx`th output term -@testset "Rrule" begin - @testset "issue 2033" begin - if CUDA.functional() - struct Wrapped{T} - x::T - end - y, _ = Flux.pullback(Wrapped, cu(randn(3,3))) - @test y isa Wrapped{<:CuArray} + g = Flux.Zygote.ForwardDiff.gradient(pv -> loss(data, 1, pv), pvec) + @test g ≈ Flux.Zygote.gradient(pv -> loss(data, 1, pv), pvec)[1] end - end end +@testset "Rrule" begin @testset "issue 2033" begin if CUDA.functional() + struct Wrapped{T} + x::T + end + y, _ = Flux.pullback(Wrapped, cu(randn(3, 3))) + @test y isa Wrapped{<:CuArray} +end end end + # make sure rng_from_array is non_differentiable @testset "rng_from_array" begin - m(x) = (rand(rng_from_array(x)) * x)[1] - gradient(m, ones(2)) + m(x) = (rand(rng_from_array(x)) * x)[1] + gradient(m, ones(2)) end