FluxML · DhairyaLGandhi · Feb 16, 2021 · Feb 16, 2021 · Feb 16, 2021 · Feb 16, 2021
diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl
@@ -1,14 +1,13 @@
 import CUDA.CUDNN: batchnorm, ∇batchnorm
 
-function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}},
-                              cache=nothing) where T<:Union{Float32, Float64}
+function (BN::Flux.BatchNorm)(x::CuArray{T},
+                              cache = nothing) where T<:Union{Float32, Float64}
 
-  @assert BN.affine "BatchNorm: only affine=true supported on gpu"
-  @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu"
-  @assert length(BN.β) == size(x, ndims(x)-1) "BatchNorm: input has wronng number of channels"
+  BN.affine || throw(ArgumentError("BatchNorm: only affine = true supported on gpu"))
+  BN.track_stats || throw(ArgumentError("BatchNorm: only track_stats = true supported on gpu"))
   return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; 
-                  cache=cache, alpha=1, beta=0, eps=BN.ϵ, 
-                  training=Flux._isactive(BN)))
+                  cache = cache, alpha = 1, beta = 0, eps = BN.ϵ,
+                  training = Flux._isactive(BN)))
 end
 
 @adjoint function batchnorm(g, b, x, running_mean, running_var, momentum; kw...)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
@@ -51,6 +51,7 @@ end
 
 Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input.
 
+Does nothing to the input once [`Flux.testmode!`](@ref) is set to `true`.
 To apply dropout along certain dimension(s), specify the `dims` keyword.
 e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input
 (also called 2D dropout).
@@ -118,7 +119,7 @@ testmode!(m::AlphaDropout, mode=true) =
   (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 """
-    LayerNorm(sz, λ=identity; affine=true, ϵ=1fe-5)
+    LayerNorm(sz, λ = identity; affine = true, ϵ = 1f-5)
 
 A [normalisation layer](https://arxiv.org/abs/1607.06450) designed to be
 used with recurrent hidden states. 
@@ -129,77 +130,89 @@ The input is normalised along the first `length(sz)` dimensions
 for tuple `sz`, along the first dimension for integer `sz`.
 The input  is expected to have first dimensions' size equal to `sz`. 
 
-If `affine=true` also applies a learnable shift and rescaling
+If `affine = true` also applies a learnable shift and rescaling
 as in the [`Diagonal`](@ref) layer.
 
 
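-Se also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref).
+See also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref).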
 """
-struct LayerNorm{F,D,T,N}
+struct LayerNorm{F,D,T,S}
   λ::F
   diag::D
   ϵ::T
-  size::NTuple{N,Int}
-  affine::Bool
+  sz::S
 end
 
-function LayerNorm(sz, λ=identity; affine=true, ϵ=1f-5)
-  sz = sz isa Integer ? (sz,) : sz
-  diag = affine ? Diagonal(sz...) : nothing
-  return LayerNorm(λ, diag, ϵ, sz, affine)
+function LayerNorm(sz, λ = identity; affine = true, ϵ = 1f-5)
+  diag = affine ? Diagonal(sz...) : identity
+  return LayerNorm(λ, diag, ϵ, sz)
 end
 
 @functor LayerNorm
 
 function (a::LayerNorm)(x)
-  x = normalise(x, dims=1:length(a.size), ϵ=a.ϵ)
-  a.diag === nothing ? a.λ.(x) : a.λ.(a.diag(x))
+  x = normalise(x, dims = 1:length(a.sz), ϵ = a.ϵ)
+  a.λ.(a.diag(x))
 end
 
 function Base.show(io::IO, l::LayerNorm)
-  print(io, "LayerNorm($(l.size)")
-  a.λ == identity || print(io, ", $(a.λ)")
-  hasaffine(l) || print(io, ", affine=false")
+  print(io, "LayerNorm($(l.sz)")  # the struct field is named `sz` after this patch
+  print(io, ", $(l.λ)")
+  print(io, ", affine = $(l.diag)")
   print(io, ")")
 end
 
 # For InstanceNorm, GroupNorm, and BatchNorm.
 # Compute the statistics on the slices specified by reduce_dims.
 # reduce_dims=[1,...,N-2,N] for BatchNorm
 # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm
-function _norm_layer_forward(l, x::AbstractArray{T,N}; reduce_dims, affine_shape) where {T, N}
+function norm_forward(l, x::AbstractArray{T,N}; reduce_dims) where {T, N}
   if !_isactive(l) && l.track_stats # testmode with tracked stats
     stats_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
     μ = reshape(l.μ, stats_shape)
     σ² = reshape(l.σ², stats_shape)
   else  # trainmode or testmode without tracked stats
-    μ = mean(x; dims=reduce_dims)
-    σ² = mean((x .- μ).^2; dims=reduce_dims)
+    μ = mean(x; dims = reduce_dims)
+    σ² = mean((x .- μ) .^ 2; dims = reduce_dims)
     if l.track_stats
       ## update moving mean/std
+      # Only the stored moving statistics change here; `μ`/`σ²` remain the batch values.
       Zygote.ignore() do
-        mtm = l.momentum
-        m = prod(size(x, i) for i in reduce_dims)  # needed for computing corrected var
-        μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims=N))
-        σ²new = vec(N ∈ reduce_dims ? σ² : mean(σ², dims=N))
-        l.μ = (1-mtm) .* l.μ .+ mtm .* μnew
-        l.σ² = (1-mtm) .* l.σ² .+ mtm .* (m / (m - one(eltype(l.σ²)))) .* σ²new
+        l.μ, l.σ² = track_stats(x, μ, σ², l.momentum; reduce_dims = reduce_dims, running = (l.μ, l.σ²))
       end
     end
   end
-  if hasaffine(l)
-    γ = reshape(l.γ, affine_shape)
-    β = reshape(l.β, affine_shape)
-    return l.λ.(γ .* (x .- μ) ./ sqrt.(σ² .+ l.ϵ) .+ β)
-  else
-    return l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
-  end
+  # The statistics to normalise with; scaling, shifting and activation are left to the caller.
+  return μ, σ²
+end
+
+# Moving average of the flattened statistics; `running` is the previous `(μ, σ²)` pair, if any.
+function track_stats(x::AbstractArray{T,N}, μ, σ², mtm; reduce_dims, running = nothing) where {T,N}
+  m = prod(size(x)[reduce_dims])  # elements per statistic, used for the corrected variance
+  μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims = N))
+  σ²new = (m / (m - one(T))) .* vec(N ∈ reduce_dims ? σ² : mean(σ², dims = N))
+  μold, σ²old = running === nothing ? (μnew, σ²new) : running
+  return (1 - mtm) .* μold .+ mtm .* μnew, (1 - mtm) .* σ²old .+ mtm .* σ²new
+end
+@nograd track_stats
+
+function affine(l, x, μ, σ², affine_shape)
+  γ = reshape(l.γ, affine_shape)
+  β = reshape(l.β, affine_shape)
+  l.λ.((γ .* (x .- μ) ./ sqrt.(σ² .+ l.ϵ)) .+ β)
 end
 
+affine(l, x, μ, σ², affine_shape::Nothing) = l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
+
+# function affine(l, x, μ, σ², affine_shape)
+#   res = (x .- μ) ./ sqrt.(σ² .+ l.ϵ)
+#   _affine(l.λ, res, affine_shape)
+# end
+
 """
-    BatchNorm(channels::Integer, λ=identity;
-              initβ=zeros, initγ=ones,
-              ϵ=1f-5, momentum= 0.1f0)
+    BatchNorm(channels::Integer, λ = identity;
+              initβ = zeros, initγ = ones,
+              ϵ = 1f-5, momentum = 0.1f0)
 
 [Batch Normalization](https://arxiv.org/abs/1502.03167) layer.
 `channels` should be the size of the channel dimension in your data (see below).
@@ -211,12 +224,12 @@ it's the usual channel dimension.
 `BatchNorm` computes the mean and variance for each `D_1×...×D_{N-2}×1×D_N` 
 input slice and normalises the input accordingly.
 
-If `affine=true`, it also applies  a shift and a rescale to the input 
+If `affine = true`, it also applies  a shift and a rescale to the input 
 through to learnable per-channel bias β and scale γ parameters.
 
 After normalisation, elementwise activation `λ` is applied.  
 
-If `track_stats=true`, accumulates mean and var statistics in training phase 
+If `track_stats = true`, accumulates mean and var statistics in training phase 
 that will be used to renormalize the input in test phase.
 
 Use [`testmode!`](@ref) during inference.
@@ -240,45 +253,47 @@ mutable struct BatchNorm{F,V,N,W}
   σ²::W    # moving var
   ϵ::N
   momentum::N
   affine::Bool
   track_stats::Bool
   active::Union{Bool, Nothing}
 end
 
-function BatchNorm(chs::Int, λ=identity;
-          initβ = i -> zeros(Float32, i), 
-          initγ = i -> ones(Float32, i), 
-          affine=true, track_stats=true,
-          ϵ=1f-5, momentum=0.1f0)
+function BatchNorm(chs::Int, λ = identity;
+                   initβ = i -> zeros(Float32, i), 
+                   initγ = i -> ones(Float32, i), 
+                   affine = true, track_stats = true,
+                   ϵ = 1f-5, momentum = 0.1f0)
 
-  β = affine ? initβ(chs) : nothing
-  γ = affine ? initγ(chs) : nothing
-  μ = track_stats ? zeros(Float32, chs) : nothing
-  σ² = track_stats ? ones(Float32, chs) : nothing
+  β = initβ(chs)
+  γ = initγ(chs)
+  μ = zeros(Float32, chs)
+  σ² = ones(Float32, chs)
 
   return BatchNorm(chs, λ, β, γ,
             μ, σ², ϵ, momentum, 
             affine, track_stats, nothing)
 end
 
 @functor BatchNorm
 trainable(bn::BatchNorm) = hasaffine(bn) ? (bn.β, bn.γ) : ()
 
 function (BN::BatchNorm)(x)
-  @assert size(x, ndims(x)-1) == BN.chs
   N = ndims(x)
+  @assert size(x, N - 1) == BN.chs
   reduce_dims = [1:N-2; N]
-  affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
-  return _norm_layer_forward(BN, x; reduce_dims, affine_shape)
+  affine_shape = BN.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
+  μ, σ² = norm_forward(BN, x;
+                       reduce_dims = reduce_dims)
+  return affine(BN, x, μ, σ², affine_shape)  # the layer is bound to `BN` in this method
 end
 
 testmode!(m::BatchNorm, mode=true) =
   (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 function Base.show(io::IO, l::BatchNorm)
   print(io, "BatchNorm($(l.chs)")
-  l.λ == identity || print(io, ", $(l.λ)")
-  hasaffine(l) || print(io,  ", affine=false")
+  print(io, ", $(l.λ)")
+  print(io,  ", affine = ")
   print(io, ")")
 end
 
@@ -316,23 +331,22 @@ mutable struct InstanceNorm{F,V,N,W}
   σ²::W  # moving var
   ϵ::N
   momentum::N
   affine::Bool
   track_stats::Bool
   active::Union{Bool, Nothing}
 end
 
-function InstanceNorm(chs::Int, λ=identity;
+function InstanceNorm(chs::Int, λ = identity;
                     initβ = i -> zeros(Float32, i), 
                     initγ = i -> ones(Float32, i), 
-                    affine=false, track_stats=false,
-                    ϵ=1f-5, momentum=0.1f0)
-
-  β = affine ? initβ(chs) : nothing
-  γ = affine ? initγ(chs) : nothing
-  μ = track_stats ? zeros(Float32, chs) : nothing
-  σ² = track_stats ? ones(Float32, chs) : nothing
-
-  return InstanceNorm(chs, λ, β, γ,
+                    affine = true, track_stats = true,
+                    ϵ = 1f-5, momentum = 0.1f0)
+
+  β = initβ(chs)
+  γ = initγ(chs)
+  μ = zeros(Float32, chs)
+  σ² = ones(Float32, chs)
+  InstanceNorm(chs, λ, β, γ,
             μ, σ², ϵ, momentum, 
             affine, track_stats, nothing)
 end
@@ -342,20 +356,21 @@ trainable(in::InstanceNorm) = hasaffine(in) ? (in.β, in.γ) : ()
 
 function (l::InstanceNorm)(x)
   @assert ndims(x) > 2
-  @assert size(x, ndims(x)-1) == l.chs
+  # @assert size(x, ndims(x)-1) == l.chs
   N = ndims(x)
   reduce_dims = 1:N-2
-  affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
-  return _norm_layer_forward(l, x; reduce_dims, affine_shape)
+  affine_shape = l.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
+  μ, σ² = norm_forward(l, x; reduce_dims = reduce_dims)
+  affine(l, x, μ, σ², affine_shape)
 end
 
 testmode!(m::InstanceNorm, mode=true) =
   (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 function Base.show(io::IO, l::InstanceNorm)
   print(io, "InstanceNorm($(l.chs)")
-  l.λ == identity || print(io, ", $(l.λ)")
-  hasaffine(l) || print(io,  ", affine=false")
+  print(io, ", $(l.λ)")
+  print(io, ", affine = ")
   print(io, ")")
 end
 
@@ -395,26 +410,26 @@ mutable struct GroupNorm{F,V,N,W}
   σ²::W    # moving std
   ϵ::N
   momentum::N
   affine::Bool
   track_stats::Bool
   active::Union{Bool, Nothing}
 end
 
 @functor GroupNorm
 trainable(gn::GroupNorm) = hasaffine(gn) ? (gn.β, gn.γ) : ()
 
-function GroupNorm(chs::Int, G::Int, λ=identity;
-              initβ = (i) -> zeros(Float32, i), 
-              initγ = (i) -> ones(Float32, i), 
-              affine=true, track_stats=false,
-              ϵ=1f-5, momentum=0.1f0) 
+function GroupNorm(chs::Int, G::Int, λ = identity;
+              initβ = i -> zeros(Float32, i), 
+              initγ = i -> ones(Float32, i), 
+              affine = true, track_stats = false,
+              ϵ = 1f-5, momentum = 0.1f0) 
 
   chs % G == 0 || error("The number of groups ($(G)) must divide the number of channels ($chs)")
 
-  β = affine ? initβ(chs) : nothing
-  γ = affine ? initγ(chs) : nothing
-  μ = track_stats ? zeros(Float32, G) : nothing
-  σ² = track_stats ? ones(Float32, G) : nothing
+  β = initβ(chs)
+  γ = initγ(chs)
+  μ = zeros(Float32, G)
+  σ² = ones(Float32, G)
 
   return GroupNorm(chs, G, λ, 
             β, γ,
@@ -425,33 +440,34 @@ end
 
 function (gn::GroupNorm)(x)
   @assert ndims(x) > 2
-  @assert size(x, ndims(x)-1) == gn.chs
-  N = ndims(x)
+  # @assert size(x, ndims(x) - 1) == gn.chs
   sz = size(x)
-  x = reshape(x, sz[1:N-2]..., sz[N-1]÷gn.G, gn.G, sz[N])
+  x = reshape(x, sz[1:end-2]..., sz[end-1] ÷ gn.G, gn.G, sz[end])  # `N` is not assigned yet
   N = ndims(x)
   reduce_dims = 1:N-2
-  affine_shape = ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N)
-  x = _norm_layer_forward(gn, x; reduce_dims, affine_shape)
-  return reshape(x, sz)
+  affine_shape = gn.affine ? ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N) : nothing
+  μ, σ² = norm_forward(gn, x;
+                       reduce_dims = reduce_dims)
+  res = affine(gn, x, μ, σ², affine_shape)
+  return reshape(res, sz)
 end
 
 testmode!(m::GroupNorm, mode = true) =
   (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 function Base.show(io::IO, l::GroupNorm)
   print(io, "GroupNorm($(l.chs), $(l.G)")
-  l.λ == identity || print(io, ", $(l.λ)")
-  hasaffine(l) || print(io,  ", affine=false")
+  print(io, ", $(l.λ)")
+  print(io, ", affine = ")
   print(io, ")")
 end
 
-"""
-  hasaffine(l)
-
-Return `true` if a normalisation layer has trainable shift and 
-scale parameters, `false` otherwise.
-
-See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref).
-"""
-hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine
+"""
+    hasaffine(l)
+
+Return `true` if a normalisation layer has trainable shift and
+scale parameters, `false` otherwise.
+See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref).
+"""
+hasaffine(l::Union{BatchNorm, InstanceNorm, GroupNorm}) = l.affine
+hasaffine(l::LayerNorm) = l.diag !== identity