From 2cd59b41c52ca6e73791b85ecf8ab2b009a2d7e0 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 16 Feb 2021 14:34:07 +0530
Subject: [PATCH 01/44] clean up the history

---
 src/layers/normalise.jl | 220 +++++++++++++++++++++-------------------
 1 file changed, 118 insertions(+), 102 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index ddf5e922f9..312927a149 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -51,6 +51,7 @@ end
 
 Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input.
 
+Does nothing to the input once [`Flux.testmode!`](@ref) is set to `true`.
 To apply dropout along certain dimension(s), specify the `dims` keyword.
 e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input
 (also called 2D dropout).
@@ -118,7 +119,7 @@ testmode!(m::AlphaDropout, mode=true) =
   (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 """
-    LayerNorm(sz, λ=identity; affine=true, ϵ=1fe-5)
+    LayerNorm(sz, λ = identity; affine = true, ϵ = 1fe-5)
 
 A [normalisation layer](https://arxiv.org/abs/1607.06450) designed to be
 used with recurrent hidden states. 
@@ -129,37 +130,35 @@ The input is normalised along the first `length(sz)` dimensions
 for tuple `sz`, along the first dimension for integer `sz`.
 The input  is expected to have first dimensions' size equal to `sz`. 
 
-If `affine=true` also applies a learnable shift and rescaling
+If `affine = true` also applies a learnable shift and rescaling
 as in the [`Diagonal`](@ref) layer.
 
 
 Se also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref).
 """
-struct LayerNorm{F,D,T,N}
+struct LayerNorm{F,D,T,S}
   λ::F
   diag::D
   ϵ::T
-  size::NTuple{N,Int}
-  affine::Bool
+  sz::S
 end
 
-function LayerNorm(sz, λ=identity; affine=true, ϵ=1f-5)
-  sz = sz isa Integer ? (sz,) : sz
-  diag = affine ? Diagonal(sz...) : nothing
-  return LayerNorm(λ, diag, ϵ, sz, affine)
+function LayerNorm(sz, λ = identity; affine = true, ϵ = 1f-5)
+  diag = affine ? Diagonal(sz...) : identity
+  return LayerNorm(λ, diag, ϵ, sz)
 end
 
 @functor LayerNorm
 
 function (a::LayerNorm)(x)
-  x = normalise(x, dims=1:length(a.size), ϵ=a.ϵ)
-  a.diag === nothing ? a.λ.(x) : a.λ.(a.diag(x))
+  x = normalise(x, dims = 1:length(a.sz), ϵ = a.ϵ)
+  a.λ.(a.diag(x))
 end
 
 function Base.show(io::IO, l::LayerNorm)
   print(io, "LayerNorm($(l.size)")
-  a.λ == identity || print(io, ", $(a.λ)")
-  hasaffine(l) || print(io, ", affine=false")
+  print(io, ", $(l.λ)")
+  print(io, ", affine = $(l.diag)")
   print(io, ")")
 end
 
@@ -167,39 +166,53 @@ end
 # Compute the statistics on the slices specified by reduce_dims.
 # reduce_dims=[1,...,N-2,N] for BatchNorm
 # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm
-function _norm_layer_forward(l, x::AbstractArray{T,N}; reduce_dims, affine_shape) where {T, N}
+function norm_forward(l, x::AbstractArray{T,N}; reduce_dims) where {T, N}
   if !_isactive(l) && l.track_stats # testmode with tracked stats
     stats_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
     μ = reshape(l.μ, stats_shape)
     σ² = reshape(l.σ², stats_shape)
   else  # trainmode or testmode without tracked stats
-    μ = mean(x; dims=reduce_dims)
-    σ² = mean((x .- μ).^2; dims=reduce_dims)
+    μ = mean(x; dims = reduce_dims)
+    σ² = mean((x .- μ) .^ 2; dims = reduce_dims)
     if l.track_stats
       ## update moving mean/std
-      Zygote.ignore() do
-        mtm = l.momentum
-        m = prod(size(x, i) for i in reduce_dims)  # needed for computing corrected var
-        μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims=N))
-        σ²new = vec(N ∈ reduce_dims ? σ² : mean(σ², dims=N))
-        l.μ = (1-mtm) .* l.μ .+ mtm .* μnew
-        l.σ² = (1-mtm) .* l.σ² .+ mtm .* (m / (m - one(eltype(l.σ²)))) .* σ²new
-      end
+
+      μ, σ² = track_stats(x, μ, σ², l.momentum, reduce_dims = reduce_dims)
+      l.μ .= μ
+      l.σ² .= σ²
     end
   end
-  if hasaffine(l)
-    γ = reshape(l.γ, affine_shape)
-    β = reshape(l.β, affine_shape)
-    return l.λ.(γ .* (x .- μ) ./ sqrt.(σ² .+ l.ϵ) .+ β)
-  else
-    return l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
-  end
+  μ, σ²
+  # affine(l, x, μ, σ², affine_shape)
+end
+
+function track_stats(x::AbstractArray{T,N}, μ, σ², mtm; reduce_dims) where {T,N}
+  m = prod(size(x)[reduce_dims])
+  μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims = N))
+  σ²new = vec(N ∈ reduce_dims ? σ² : mean(σ², dims = N))
+  μ = (1 - mtm) .* μ .+ mtm .* μnew
+  σ² = (1 - mtm) .* σ² .+ mtm .* (m / (m - one(T))) .* σ²new
+  μ, σ²
+end
+@nograd track_stats
+
+function affine(l, x, μ, σ², affine_shape)
+  γ = reshape(l.γ, affine_shape)
+  β = reshape(l.β, affine_shape)
+  l.λ.((γ .* (x .- μ) ./ sqrt.(σ² .+ l.ϵ)) .+ β)
 end
 
+affine(l, x, μ, σ², affine_shape::Nothing) = l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
+
+# function affine(l, x, μ, σ², affine_shape)
+#   res = (x .- μ) ./ sqrt.(σ² .+ l.ϵ)
+#   _affine(l.λ, res, affine_shape)
+# end
+
 """
-    BatchNorm(channels::Integer, λ=identity;
-              initβ=zeros, initγ=ones,
-              ϵ=1f-5, momentum= 0.1f0)
+    BatchNorm(channels::Integer, λ = identity;
+              initβ = zeros, initγ = ones,
+              ϵ = 1f-5, momentum = 0.1f0)
 
 [Batch Normalization](https://arxiv.org/abs/1502.03167) layer.
 `channels` should be the size of the channel dimension in your data (see below).
@@ -211,12 +224,12 @@ it's the usual channel dimension.
 `BatchNorm` computes the mean and variance for each `D_1×...×D_{N-2}×1×D_N` 
 input slice and normalises the input accordingly.
 
-If `affine=true`, it also applies  a shift and a rescale to the input 
+If `affine = true`, it also applies  a shift and a rescale to the input 
 through to learnable per-channel bias β and scale γ parameters.
 
 After normalisation, elementwise activation `λ` is applied.  
 
-If `track_stats=true`, accumulates mean and var statistics in training phase 
+If `track_stats = true`, accumulates mean and var statistics in training phase 
 that will be used to renormalize the input in test phase.
 
 Use [`testmode!`](@ref) during inference.
@@ -240,36 +253,38 @@ mutable struct BatchNorm{F,V,N,W}
   σ²::W    # moving var
   ϵ::N
   momentum::N
-  affine::Bool
-  track_stats::Bool
-  active::Union{Bool, Nothing}
+  # affine::Bool
+  # track_stats::Bool
+  # active::Union{Bool, Nothing}
 end
 
-function BatchNorm(chs::Int, λ=identity;
-          initβ = i -> zeros(Float32, i), 
-          initγ = i -> ones(Float32, i), 
-          affine=true, track_stats=true,
-          ϵ=1f-5, momentum=0.1f0)
+function BatchNorm(chs::Int, λ = identity;
+                   initβ = i -> zeros(Float32, i), 
+                   initγ = i -> ones(Float32, i), 
+                   affine = true, track_stats = true,
+                   ϵ = 1f-5, momentum = 0.1f0)
 
-  β = affine ? initβ(chs) : nothing
-  γ = affine ? initγ(chs) : nothing
-  μ = track_stats ? zeros(Float32, chs) : nothing
-  σ² = track_stats ? ones(Float32, chs) : nothing
+  β = initβ(chs)
+  γ = initγ(chs)
+  μ = zeros(Float32, chs)
+  σ² = ones(Float32, chs)
 
   return BatchNorm(chs, λ, β, γ,
-            μ, σ², ϵ, momentum, 
-            affine, track_stats, nothing)
+            μ, σ², ϵ, momentum)
+#             affine, track_stats, nothing)
 end
 
 @functor BatchNorm
 trainable(bn::BatchNorm) = hasaffine(bn) ? (bn.β, bn.γ) : ()
 
 function (BN::BatchNorm)(x)
-  @assert size(x, ndims(x)-1) == BN.chs
   N = ndims(x)
+  @assert size(x, N - 1) == BN.chs
   reduce_dims = [1:N-2; N]
-  affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
-  return _norm_layer_forward(BN, x; reduce_dims, affine_shape)
+  affine_shape = BN.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
+  μ, σ² = norm_forward(BN, x;
+                       reduce_dims = reduce_dims)
+  affine(l, x, μ, σ², affine_shape)
 end
 
 testmode!(m::BatchNorm, mode=true) =
@@ -277,8 +292,8 @@ testmode!(m::BatchNorm, mode=true) =
 
 function Base.show(io::IO, l::BatchNorm)
   print(io, "BatchNorm($(l.chs)")
-  l.λ == identity || print(io, ", $(l.λ)")
-  hasaffine(l) || print(io,  ", affine=false")
+  print(io, ", $(l.λ)")
+  print(io,  ", affine = ")
   print(io, ")")
 end
 
@@ -316,23 +331,22 @@ mutable struct InstanceNorm{F,V,N,W}
   σ²::W  # moving var
   ϵ::N
   momentum::N
-  affine::Bool
-  track_stats::Bool
-  active::Union{Bool, Nothing}
+  # affine::Bool
+  # track_stats::Bool
+  # active::Union{Bool, Nothing}
 end
 
-function InstanceNorm(chs::Int, λ=identity;
+function InstanceNorm(chs::Int, λ = identity;
                     initβ = i -> zeros(Float32, i), 
                     initγ = i -> ones(Float32, i), 
-                    affine=false, track_stats=false,
-                    ϵ=1f-5, momentum=0.1f0)
-
-  β = affine ? initβ(chs) : nothing
-  γ = affine ? initγ(chs) : nothing
-  μ = track_stats ? zeros(Float32, chs) : nothing
-  σ² = track_stats ? ones(Float32, chs) : nothing
-
-  return InstanceNorm(chs, λ, β, γ,
+                    affine = true, track_stats = true,
+                    ϵ = 1f-5, momentum = 0.1f0)
+
+  β = initβ(chs)
+  γ = initγ(chs)
+  μ = zeros(Float32, chs)
+  σ² = ones(Float32, chs)
+  InstanceNorm(chs, λ, β, γ,
             μ, σ², ϵ, momentum, 
             affine, track_stats, nothing)
 end
@@ -342,11 +356,12 @@ trainable(in::InstanceNorm) = hasaffine(in) ? (in.β, in.γ) : ()
 
 function (l::InstanceNorm)(x)
   @assert ndims(x) > 2
-  @assert size(x, ndims(x)-1) == l.chs
+  # @assert size(x, ndims(x)-1) == l.chs
   N = ndims(x)
   reduce_dims = 1:N-2
-  affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
-  return _norm_layer_forward(l, x; reduce_dims, affine_shape)
+  affine_shape = l.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
+  μ, σ² = norm_forward(l, x; reduce_dims = reduce_dims)
+  affine(l, x, μ, σ², affine_shape)
 end
 
 testmode!(m::InstanceNorm, mode=true) =
@@ -354,8 +369,8 @@ testmode!(m::InstanceNorm, mode=true) =
 
 function Base.show(io::IO, l::InstanceNorm)
   print(io, "InstanceNorm($(l.chs)")
-  l.λ == identity || print(io, ", $(l.λ)")
-  hasaffine(l) || print(io,  ", affine=false")
+  print(io, ", $(l.λ)")
+  print(io, ", affine = ")
   print(io, ")")
 end
 
@@ -395,26 +410,26 @@ mutable struct GroupNorm{F,V,N,W}
   σ²::W    # moving std
   ϵ::N
   momentum::N
-  affine::Bool
-  track_stats::Bool
-  active::Union{Bool, Nothing}
+  # affine::Bool
+  # track_stats::Bool
+  # active::Union{Bool, Nothing}
 end
 
 @functor GroupNorm
 trainable(gn::GroupNorm) = hasaffine(gn) ? (gn.β, gn.γ) : ()
 
-function GroupNorm(chs::Int, G::Int, λ=identity;
-              initβ = (i) -> zeros(Float32, i), 
-              initγ = (i) -> ones(Float32, i), 
-              affine=true, track_stats=false,
-              ϵ=1f-5, momentum=0.1f0) 
+function GroupNorm(chs::Int, G::Int, λ = identity;
+              initβ = i -> zeros(Float32, i), 
+              initγ = i -> ones(Float32, i), 
+              affine = true, track_stats = false,
+              ϵ = 1f-5, momentum = 0.1f0) 
 
   chs % G == 0 || error("The number of groups ($(G)) must divide the number of channels ($chs)")
 
-  β = affine ? initβ(chs) : nothing
-  γ = affine ? initγ(chs) : nothing
-  μ = track_stats ? zeros(Float32, G) : nothing
-  σ² = track_stats ? ones(Float32, G) : nothing
+  β = initβ(chs)
+  γ = initγ(chs)
+  μ = zeros(Float32, G)
+  σ² = ones(Float32, G)
 
   return GroupNorm(chs, G, λ, 
             β, γ,
@@ -425,15 +440,16 @@ end
 
 function (gn::GroupNorm)(x)
   @assert ndims(x) > 2
-  @assert size(x, ndims(x)-1) == gn.chs
-  N = ndims(x)
+  # @assert size(x, ndims(x) - 1) == gn.chs
   sz = size(x)
-  x = reshape(x, sz[1:N-2]..., sz[N-1]÷gn.G, gn.G, sz[N])
+  x = reshape(x, sz[1:N-2]..., sz[N-1] ÷ gn.G, gn.G, sz[N])
   N = ndims(x)
   reduce_dims = 1:N-2
-  affine_shape = ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N)
-  x = _norm_layer_forward(gn, x; reduce_dims, affine_shape)
-  return reshape(x, sz)
+  affine_shape = gn.affine ? ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N) : nothing
+  μ, σ² = norm_forward(gn, x;
+                       reduce_dims = reduce_dims)
+  res = affine(l, x, μ, σ², affine_shape)
+  return reshape(res, sz)
 end
 
 testmode!(m::GroupNorm, mode = true) =
@@ -441,17 +457,17 @@ testmode!(m::GroupNorm, mode = true) =
 
 function Base.show(io::IO, l::GroupNorm)
   print(io, "GroupNorm($(l.chs), $(l.G)")
-  l.λ == identity || print(io, ", $(l.λ)")
-  hasaffine(l) || print(io,  ", affine=false")
+  print(io, ", $(l.λ)")
+  print(io, ", affine = ")
   print(io, ")")
 end
 
-"""
-  hasaffine(l)
-
-Return `true` if a normalisation layer has trainable shift and 
-scale parameters, `false` otherwise.
-
-See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref).
-"""
-hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine
\ No newline at end of file
+# """
+#   hasaffine(l)
+# 
+# Return `true` if a normalisation layer has trainable shift and 
+# scale parameters, `false` otherwise.
+# 
+# See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref).
+# """
+# hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine

From 0ef033ed65670a1b1911d6517810fe67d0accb50 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 16 Feb 2021 14:47:28 +0530
Subject: [PATCH 02/44] bn on cudnn

---
 src/cuda/cudnn.jl | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/cuda/cudnn.jl b/src/cuda/cudnn.jl
index 0494672791..aa1dffb14a 100644
--- a/src/cuda/cudnn.jl
+++ b/src/cuda/cudnn.jl
@@ -1,14 +1,13 @@
 import CUDA.CUDNN: batchnorm, ∇batchnorm
 
-function (BN::Flux.BatchNorm)(x::Union{CuArray{T,2},CuArray{T,4},CuArray{T,5}},
-                              cache=nothing) where T<:Union{Float32, Float64}
+function (BN::Flux.BatchNorm)(x::CuArray{T},
+                              cache = nothing) where T<:Union{Float32, Float64}
   
-  @assert BN.affine "BatchNorm: only affine=true supported on gpu"
-  @assert BN.track_stats "BatchNorm: only track_stats=true supported on gpu"
-  @assert length(BN.β) == size(x, ndims(x)-1) "BatchNorm: input has wronng number of channels"
+  @assert BN.affine throw(ArgumentError("BatchNorm: only affine = true supported on gpu"))
+  @assert BN.track_stats throw(ArgumentError("BatchNorm: only track_stats = true supported on gpu"))
   return BN.λ.(batchnorm(BN.γ, BN.β, x, BN.μ, BN.σ², BN.momentum; 
-                  cache=cache, alpha=1, beta=0, eps=BN.ϵ, 
-                  training=Flux._isactive(BN)))
+                  cache = cache, alpha = 1, beta = 0, eps = BN.ϵ, 
+                  training = Flux._isactive(BN)))
 end
 
 @adjoint function batchnorm(g, b, x, running_mean, running_var, momentum; kw...)

From 1e67700974a428fc2a45b721bdfb6c62783b574e Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 16 Feb 2021 14:47:52 +0530
Subject: [PATCH 03/44] rm extra utils

---
 test/cuda/losses.jl   |  2 +-
 test/cuda/runtests.jl | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/test/cuda/losses.jl b/test/cuda/losses.jl
index a0f7f47d80..2049e16eee 100644
--- a/test/cuda/losses.jl
+++ b/test/cuda/losses.jl
@@ -31,7 +31,7 @@ y = [1  0  0  0  1
   y = rand(Float32, 3,3)
 
   for loss in ALL_LOSSES
-    gpu_autodiff_test(loss, x, y)
+    gpu_gradtest(loss, x, y)
   end
 end
 
diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl
index 6a6722b238..9cb70e16f8 100644
--- a/test/cuda/runtests.jl
+++ b/test/cuda/runtests.jl
@@ -4,7 +4,21 @@ using Zygote: pullback
 @info "Testing GPU Support"
 CUDA.allowscalar(false)
 
-include("test_utils.jl")
+function gpu_gradtest(f, args...)
+  args_gpu = gpu.(args)
+
+  l_cpu, back_cpu = pullback((x...) -> f(x...), args...)
+  g_cpu = back_cpu(1f0)[1]
+
+  l_gpu, back_gpu = pullback((x...) -> f(x...), args_gpu...)
+  g_gpu = back_gpu(1f0)[1]
+
+  @test l_cpu ≈ l_gpu   rtol=1e-4 atol=1e-4
+  @test g_gpu isa CuArray
+  @test g_cpu ≈ collect(g_gpu)   rtol=1e-4 atol=1e-4
+end
+
+
 include("cuda.jl")
 include("losses.jl")
 include("layers.jl")
@@ -15,4 +29,4 @@ if CUDA.has_cudnn()
   include("curnn.jl")
 else
   @warn "CUDNN unavailable, not testing GPU DNN support"
-end
\ No newline at end of file
+end

From 0dbeb3936d04375d73503db8b80d7d38f8ef9403 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 16 Feb 2021 14:48:27 +0530
Subject: [PATCH 04/44] use simpler test suite first

---
 test/cuda/layers.jl | 66 +++++----------------------------------------
 1 file changed, 7 insertions(+), 59 deletions(-)

diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index 31b566720b..a0e6854a12 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -16,23 +16,20 @@ end
 const BROKEN_LAYERS = Union{DepthwiseConv,
                             AlphaDropout}
 
-function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; 
-            setmode=false, test_cpu=true, rtol=1e-5, atol=1e-5)
+function gpu_gradtest(name::String, layers::Vector, x_cpu, args...) 
   @testset "$name GPU grad tests" begin
     for layer in layers
       @testset "$layer GPU grad test" begin
         l_cpu = layer(args...)
+        l_gpu, x_gpu = gpu(l_cpu), gpu(x_cpu)
         if l_cpu isa BROKEN_LAYERS
-          l_gpu, x_gpu = l_cpu |> gpu, x_cpu |> gpu
           @test_broken gradient(() -> sum(l_gpu(x_gpu)), Flux.params(l_gpu)) isa Flux.Zygote.Grads
         else
-          gpu_autodiff_test(l_cpu, x_cpu, 
-              test_equal=test_cpu, rtol=rtol, atol=atol)
-          if setmode
-            testmode!(l_cpu)
-            gpu_autodiff_test(l_cpu, x_cpu, 
-              test_equal=test_cpu, rtol=rtol, atol=atol)
-          end  
+          ps_gpu = Flux.params(l_gpu)
+          ps_cpu = Flux.params(l_cpu)
+          y_gpu, back_gpu = pullback(() -> sum(l_gpu(x_gpu)), ps_gpu)
+          gs_gpu = back_gpu(1.f0)
+          @test gs isa Flux.Zygote.Grads
         end
       end
     end
@@ -79,15 +76,6 @@ gpu_gradtest("GroupNorm 3d", groupnorm, rand(Float32, 8, 8, 8, 12, 4), 12, 3, se
 gpu_gradtest("GroupNorm 2d", groupnorm, rand(Float32, 8, 8, 12, 4), 12, 3, setmode=true)
 gpu_gradtest("GroupNorm 1d", groupnorm, rand(Float32, 8, 3, 12, 4), 12, 3, setmode=true)
 
-upsample = [x -> Upsample(scale=x)]
-gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2))
-gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,))
-
-pixelshuffle = [PixelShuffle]
-gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3)
-gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3)
-
-
 @testset "function layers" begin
   x = rand(Float32, 3,3)
   gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x)
@@ -179,43 +167,3 @@ end
   @test l.b ∉ gs.params 
 end
 
-@testset "Two-streams Bilinear" begin
-  x = zeros(Float32,10,9) |> gpu
-  y = zeros(Float32,2,9) |> gpu
-  b = Flux.Bilinear(10, 2, 3) |> gpu
-  @test size(b(x,y)) == (3,9)
-  @test sum(abs2, b(x,y)) ≈ 0f0
-  gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b))
-  b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu
-  gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu))
-  for (pgpu, pcpu) in zip(params(b), params(b_cpu))
-    @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu])
-  end
-end
-
-@testset "Parallel" begin
-  @testset "zero sum" begin
-    input = randn(10, 10, 10, 10) |> gpu
-    layer_gpu = Parallel(+, zero, identity) |> gpu
-    @test layer_gpu(input) == input
-    @test layer_gpu(input) isa Flux.CUDA.CuArray
-  end
-
-  @testset "vararg input" begin
-    inputs = (randn(10), randn(5), randn(4)) .|> gpu
-    layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu
-    @test size(layer(inputs)) == (2,)
-  end
-
-  @testset "gradient" begin
-    input_cpu = randn(10, 10, 10, 10)
-    input_gpu = input_cpu |> gpu
-    layer_cpu = Parallel(+, x -> zero(x), identity)
-    layer_gpu = layer_cpu |> gpu
-    gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu))
-    gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu))
-    for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu))
-      @test gs_cpu[pcpu] ≈ gs_gpu[pgpu]
-    end
-  end
-end

From 04ca0a894d9466c129e3c0da2a38632fcdc001d7 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 16 Feb 2021 14:52:35 +0530
Subject: [PATCH 05/44] git fixes

---
 test/cuda/layers.jl | 54 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index a0e6854a12..ae819348ed 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -76,11 +76,19 @@ gpu_gradtest("GroupNorm 3d", groupnorm, rand(Float32, 8, 8, 8, 12, 4), 12, 3, se
 gpu_gradtest("GroupNorm 2d", groupnorm, rand(Float32, 8, 8, 12, 4), 12, 3, setmode=true)
 gpu_gradtest("GroupNorm 1d", groupnorm, rand(Float32, 8, 3, 12, 4), 12, 3, setmode=true)
 
+upsample = [x -> Upsample(scale=x)]
+ gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2))
+ gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,))
+
+ pixelshuffle = [PixelShuffle]
+ gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3)
+ gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3)
+
 @testset "function layers" begin
   x = rand(Float32, 3,3)
-  gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x)
-  gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x)
-  gpu_autodiff_test(x -> sum(Flux.normalise(x)), x)
+  gpu_gradtest(x -> sum(Flux.normalise(x; dims=1)), x)
+  gpu_gradtest(x -> sum(Flux.normalise(x; dims=2)), x)
+  gpu_gradtest(x -> sum(Flux.normalise(x)), x)
 end
 
 @testset "BatchNorm mix stuff" begin
@@ -167,3 +175,43 @@ end
   @test l.b ∉ gs.params 
 end
 
+@testset "Two-streams Bilinear" begin
+  x = zeros(Float32,10,9) |> gpu
+  y = zeros(Float32,2,9) |> gpu
+  b = Flux.Bilinear(10, 2, 3) |> gpu
+  @test size(b(x,y)) == (3,9)
+  @test sum(abs2, b(x,y)) ≈ 0f0
+  gs_gpu = gradient(() -> sum(abs2.(b(x, y))), params(b))
+  b_cpu, x_cpu, y_cpu = b |> cpu, x |> cpu, y |> cpu
+  gs_cpu = gradient(() -> sum(abs2.(b_cpu(x_cpu, y_cpu))), params(b_cpu))
+  for (pgpu, pcpu) in zip(params(b), params(b_cpu))
+    @test gs_cpu[pcpu] ≈ Array(gs_gpu[pgpu])
+  end
+end
+
+@testset "Parallel" begin
+  @testset "zero sum" begin
+    input = randn(10, 10, 10, 10) |> gpu
+    layer_gpu = Parallel(+, zero, identity) |> gpu
+    @test layer_gpu(input) == input
+    @test layer_gpu(input) isa Flux.CUDA.CuArray
+  end
+
+  @testset "vararg input" begin
+    inputs = (randn(10), randn(5), randn(4)) .|> gpu
+    layer = Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2)) |> gpu
+    @test size(layer(inputs)) == (2,)
+  end
+
+  @testset "gradient" begin
+    input_cpu = randn(10, 10, 10, 10)
+    input_gpu = input_cpu |> gpu
+    layer_cpu = Parallel(+, x -> zero(x), identity)
+    layer_gpu = layer_cpu |> gpu
+    gs_cpu = gradient(() -> sum(abs2.(layer_cpu(input_cpu))), params(layer_cpu))
+    gs_gpu = gradient(() -> sum(abs2.(layer_gpu(input_gpu))), params(layer_gpu))
+    for (pgpu, pcpu) in zip(params(layer_cpu), params(layer_gpu))
+      @test gs_cpu[pcpu] ≈ gs_gpu[pgpu]
+    end
+  end
+end

From 31f7000a9186a67d04cf3cf956dacd6e22590055 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Fri, 19 Feb 2021 04:28:47 +0530
Subject: [PATCH 06/44] refactor norm_forward

---
 src/layers/normalise.jl | 62 ++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 312927a149..852dc7a1f9 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -166,24 +166,25 @@ end
 # Compute the statistics on the slices specified by reduce_dims.
 # reduce_dims=[1,...,N-2,N] for BatchNorm
 # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm
-function norm_forward(l, x::AbstractArray{T,N}; reduce_dims) where {T, N}
-  if !_isactive(l) && l.track_stats # testmode with tracked stats
+function norm_forward(l, ts, x::AbstractArray{T,N}; reduce_dims) where {T, N}
+  if !_isactive(l) # testmode with tracked stats
     stats_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
     μ = reshape(l.μ, stats_shape)
     σ² = reshape(l.σ², stats_shape)
   else  # trainmode or testmode without tracked stats
     μ = mean(x; dims = reduce_dims)
     σ² = mean((x .- μ) .^ 2; dims = reduce_dims)
-    if l.track_stats
-      ## update moving mean/std
-
-      μ, σ² = track_stats(x, μ, σ², l.momentum, reduce_dims = reduce_dims)
-      l.μ .= μ
-      l.σ² .= σ²
-    end
+    μ, σ² = track_stats(x, μ, σ², l.momentum, reduce_dims = reduce_dims)
+    l.μ .= μ
+    l.σ² .= σ²
   end
   μ, σ²
-  # affine(l, x, μ, σ², affine_shape)
+end
+
+function norm_forward(l, ::Nothing, x::AbstractArray{T,N}; reduce_dims) where {T, N}
+  μ = mean(x; dims = reduce_dims)
+  σ² = mean((x .- μ) .^ 2; dims = reduce_dims)
+  μ, σ²
 end
 
 function track_stats(x::AbstractArray{T,N}, μ, σ², mtm; reduce_dims) where {T,N}
@@ -253,9 +254,9 @@ mutable struct BatchNorm{F,V,N,W}
   σ²::W    # moving var
   ϵ::N
   momentum::N
-  # affine::Bool
-  # track_stats::Bool
-  # active::Union{Bool, Nothing}
+  affine::Bool
+  track_stats::Bool
+  active::Union{Bool, Nothing}
 end
 
 function BatchNorm(chs::Int, λ = identity;
@@ -270,19 +271,20 @@ function BatchNorm(chs::Int, λ = identity;
   σ² = ones(Float32, chs)
 
   return BatchNorm(chs, λ, β, γ,
-            μ, σ², ϵ, momentum)
-#             affine, track_stats, nothing)
+            μ, σ², ϵ, momentum,
+            affine, track_stats, nothing)
 end
 
 @functor BatchNorm
-trainable(bn::BatchNorm) = hasaffine(bn) ? (bn.β, bn.γ) : ()
+# trainable(bn::BatchNorm) = hasaffine(bn) ? (bn.β, bn.γ) : ()
 
 function (BN::BatchNorm)(x)
   N = ndims(x)
   @assert size(x, N - 1) == BN.chs
   reduce_dims = [1:N-2; N]
   affine_shape = BN.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
-  μ, σ² = norm_forward(BN, x;
+  ts = BN.track_stats ? BN.track_stats : nothing
+  μ, σ² = norm_forward(BN, ts, x;
                        reduce_dims = reduce_dims)
   affine(l, x, μ, σ², affine_shape)
 end
@@ -331,9 +333,9 @@ mutable struct InstanceNorm{F,V,N,W}
   σ²::W  # moving var
   ϵ::N
   momentum::N
-  # affine::Bool
-  # track_stats::Bool
-  # active::Union{Bool, Nothing}
+  affine::Bool
+  track_stats::Bool
+  active::Union{Bool, Nothing}
 end
 
 function InstanceNorm(chs::Int, λ = identity;
@@ -352,15 +354,16 @@ function InstanceNorm(chs::Int, λ = identity;
 end
 
 @functor InstanceNorm
-trainable(in::InstanceNorm) = hasaffine(in) ? (in.β, in.γ) : ()
+# trainable(in::InstanceNorm) = hasaffine(in) ? (in.β, in.γ) : ()
 
 function (l::InstanceNorm)(x)
   @assert ndims(x) > 2
-  # @assert size(x, ndims(x)-1) == l.chs
+  @assert size(x, ndims(x)-1) == l.chs
   N = ndims(x)
   reduce_dims = 1:N-2
   affine_shape = l.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
-  μ, σ² = norm_forward(l, x; reduce_dims = reduce_dims)
+  ts = BN.track_stats ? BN.track_stats : nothing
+  μ, σ² = norm_forward(l, ts, x; reduce_dims = reduce_dims)
   affine(l, x, μ, σ², affine_shape)
 end
 
@@ -410,13 +413,13 @@ mutable struct GroupNorm{F,V,N,W}
   σ²::W    # moving std
   ϵ::N
   momentum::N
-  # affine::Bool
-  # track_stats::Bool
-  # active::Union{Bool, Nothing}
+  affine::Bool
+  track_stats::Bool
+  active::Union{Bool, Nothing}
 end
 
 @functor GroupNorm
-trainable(gn::GroupNorm) = hasaffine(gn) ? (gn.β, gn.γ) : ()
+# trainable(gn::GroupNorm) = hasaffine(gn) ? (gn.β, gn.γ) : ()
 
 function GroupNorm(chs::Int, G::Int, λ = identity;
               initβ = i -> zeros(Float32, i), 
@@ -440,13 +443,14 @@ end
 
 function (gn::GroupNorm)(x)
   @assert ndims(x) > 2
-  # @assert size(x, ndims(x) - 1) == gn.chs
+  @assert size(x, ndims(x) - 1) == gn.chs
   sz = size(x)
   x = reshape(x, sz[1:N-2]..., sz[N-1] ÷ gn.G, gn.G, sz[N])
   N = ndims(x)
   reduce_dims = 1:N-2
   affine_shape = gn.affine ? ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N) : nothing
-  μ, σ² = norm_forward(gn, x;
+  ts = BN.track_stats ? BN.track_stats : nothing
+  μ, σ² = norm_forward(gn, ts, x;
                        reduce_dims = reduce_dims)
   res = affine(l, x, μ, σ², affine_shape)
   return reshape(res, sz)

From 262112a1bd18249cc4a9a9f7802c129e8b76d03f Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Sat, 20 Feb 2021 17:15:45 +0530
Subject: [PATCH 07/44] simplify tests

---
 test/layers/normalisation.jl | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 89c2f4803e..4caaa66ce9 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -11,22 +11,22 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
 
   x = rand(100)
   m = Dropout(0.9)
-  y = evalwgrad(m, x)
+  y = m(x)
   @test count(a->a==0, y) > 50
   testmode!(m, true)
-  y = evalwgrad(m, x) # should override istraining
+  y = m(x) # should override istraining
   @test count(a->a==0, y) == 0
   testmode!(m, false)
-  y = evalwgrad(m, x)
+  y = m(x)
   @test count(a->a==0, y) > 50
 
   x = rand(Float32, 100)
   m = Chain(Dense(100,100),
             Dropout(0.9))
-  y = evalwgrad(m, x)
+  y = m(x)
   @test count(a->a == 0, y) > 50
   testmode!(m, true)
-  y = evalwgrad(m, x) # should override istraining
+  y = m(x) # should override istraining
   @test count(a->a == 0, y) == 0
 
   x = rand(100, 50)
@@ -69,7 +69,7 @@ end
     # initial m.σ is 1
     # initial m.μ is 0
 
-    y = evalwgrad(m, x)
+    y = m(x)
     @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5)
     # julia> x
     #  2×3 Array{Float64,2}:
@@ -139,7 +139,7 @@ end
       x = Float32.(x)
       @test m.β == [0, 0]  # initβ(2)
       @test m.γ == [1, 1]  # initγ(2)
-      y = evalwgrad(m, x)
+      y = m(x)
 
       #julia> x
       #[:, :, 1] =
@@ -182,7 +182,7 @@ end
     affine_shape = collect(sizes)
     affine_shape[[1,3]] .= 1
 
-    y = evalwgrad(m, x)
+    y = m(x)
     y = m(x) # inference time after a training step    
     μ = reshape(m.μ, affine_shape...)
     σ² = reshape(m.σ², affine_shape...)
@@ -225,7 +225,7 @@ end
   # check that μ, σ², and the output are the correct size for higher rank tensors
   let m = InstanceNorm(2; affine=true,track_stats=true), sizes = (5, 5, 3, 4, 2, 6),
       x = reshape(Float32.(collect(1:prod(sizes))), sizes)
-    y = evalwgrad(m, x)
+    y = m(x)
     @test size(m.μ) == (sizes[end - 1], )
     @test size(m.σ²) == (sizes[end - 1], )
     @test size(y) == sizes
@@ -282,7 +282,7 @@ end
       @test m.β == [0, 0, 0, 0]  # initβ(32)
       @test m.γ == [1, 1, 1, 1]  # initγ(32)
 
-      y = evalwgrad(m, x)
+      y = m(x)
 
       #julia> x
       #[:, :, 1]  =
@@ -351,7 +351,7 @@ end
   # check that μ, σ², and the output are the correct size for higher rank tensors
   let m = GroupNorm(4,2, track_stats=true), sizes = (5, 5, 3, 4, 4, 6),
       x = Float32.(reshape(collect(1:prod(sizes)), sizes))
-    y = evalwgrad(m, x)
+    y = m(x)
     @test size(m.μ) == (m.G,)
     @test size(m.σ²) == (m.G,)
     @test size(y) == sizes

From c61457c3be996f9305908f8c7f756cbe0a87e66a Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Sat, 20 Feb 2021 17:17:41 +0530
Subject: [PATCH 08/44] typose

---
 src/layers/normalise.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 852dc7a1f9..e2dd1eadca 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -362,7 +362,7 @@ function (l::InstanceNorm)(x)
   N = ndims(x)
   reduce_dims = 1:N-2
   affine_shape = l.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
-  ts = BN.track_stats ? BN.track_stats : nothing
+  ts = l.track_stats ? l.track_stats : nothing
   μ, σ² = norm_forward(l, ts, x; reduce_dims = reduce_dims)
   affine(l, x, μ, σ², affine_shape)
 end
@@ -449,7 +449,7 @@ function (gn::GroupNorm)(x)
   N = ndims(x)
   reduce_dims = 1:N-2
   affine_shape = gn.affine ? ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N) : nothing
-  ts = BN.track_stats ? BN.track_stats : nothing
+  ts = gn.track_stats ? gn.track_stats : nothing
   μ, σ² = norm_forward(gn, ts, x;
                        reduce_dims = reduce_dims)
   res = affine(l, x, μ, σ², affine_shape)

From 0432b82b9f7b8de66015a2019c6ac809eee44562 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Sat, 20 Feb 2021 20:05:19 +0530
Subject: [PATCH 09/44] check reduce for batch

---
 src/layers/normalise.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index e2dd1eadca..7e454b89df 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -189,8 +189,8 @@ end
 
 function track_stats(x::AbstractArray{T,N}, μ, σ², mtm; reduce_dims) where {T,N}
   m = prod(size(x)[reduce_dims])
-  μnew = vec(N ∈ reduce_dims ? μ : mean(μ, dims = N))
-  σ²new = vec(N ∈ reduce_dims ? σ² : mean(σ², dims = N))
+  μnew = vec(N == last(reduce_dims) ? μ : mean(μ, dims = N))
+  σ²new = vec(N == last(reduce_dims) ? σ² : mean(σ², dims = N))
   μ = (1 - mtm) .* μ .+ mtm .* μnew
   σ² = (1 - mtm) .* σ² .+ mtm .* (m / (m - one(T))) .* σ²new
   μ, σ²
@@ -286,7 +286,7 @@ function (BN::BatchNorm)(x)
   ts = BN.track_stats ? BN.track_stats : nothing
   μ, σ² = norm_forward(BN, ts, x;
                        reduce_dims = reduce_dims)
-  affine(l, x, μ, σ², affine_shape)
+  affine(BN, x, μ, σ², affine_shape)
 end
 
 testmode!(m::BatchNorm, mode=true) =
@@ -452,7 +452,7 @@ function (gn::GroupNorm)(x)
   ts = gn.track_stats ? gn.track_stats : nothing
   μ, σ² = norm_forward(gn, ts, x;
                        reduce_dims = reduce_dims)
-  res = affine(l, x, μ, σ², affine_shape)
+  res = affine(gn, x, μ, σ², affine_shape)
   return reshape(res, sz)
 end
 

From 77a8e878bfd85a1e8ba3b83d0fcf51a4d71133c0 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Sun, 21 Feb 2021 21:44:54 +0530
Subject: [PATCH 10/44] use normconfig

---
 src/layers/normalise.jl | 42 ++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 7e454b89df..eee56f7dc4 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -162,28 +162,41 @@ function Base.show(io::IO, l::LayerNorm)
   print(io, ")")
 end
 
+struct NormConfig{A,T}
+  dims::Vector{Int}
+end
+
+NormConfig(affine, track_stats, reduce_dims) = NormConfig{affine, track_stats}(reduce_dims)
+
+function getaffine(nc::NormConfig{true}, sz_x) where N
+  n = length(sz_x)
+  ntuple(i -> i == n-1 ? sz_x[n-1] : 1, n)
+end
+
+getaffine(nc::NormConfig{false}, args...) = ()
+
 # For InstanceNorm, GroupNorm, and BatchNorm.
 # Compute the statistics on the slices specified by reduce_dims.
 # reduce_dims=[1,...,N-2,N] for BatchNorm
 # reduce_dims=[1,...,N-2] for InstanceNorm and GroupNorm
-function norm_forward(l, ts, x::AbstractArray{T,N}; reduce_dims) where {T, N}
+function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, true}) where {A, T, N}
   if !_isactive(l) # testmode with tracked stats
     stats_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
     μ = reshape(l.μ, stats_shape)
     σ² = reshape(l.σ², stats_shape)
   else  # trainmode or testmode without tracked stats
-    μ = mean(x; dims = reduce_dims)
-    σ² = mean((x .- μ) .^ 2; dims = reduce_dims)
-    μ, σ² = track_stats(x, μ, σ², l.momentum, reduce_dims = reduce_dims)
+    μ = mean(x; dims = nc.dims)
+    σ² = mean((x .- μ) .^ 2; dims = nc.dims)
+    μ, σ² = track_stats(x, μ, σ², l.momentum, reduce_dims = nc.dims)
     l.μ .= μ
     l.σ² .= σ²
   end
   μ, σ²
 end
 
-function norm_forward(l, ::Nothing, x::AbstractArray{T,N}; reduce_dims) where {T, N}
-  μ = mean(x; dims = reduce_dims)
-  σ² = mean((x .- μ) .^ 2; dims = reduce_dims)
+function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, false}) where {A, T, N}
+  μ = mean(x; dims = nc.dims)
+  σ² = mean((x .- μ) .^ 2; dims = nc.dims)
   μ, σ²
 end
 
@@ -197,13 +210,14 @@ function track_stats(x::AbstractArray{T,N}, μ, σ², mtm; reduce_dims) where {T
 end
 @nograd track_stats
 
-function affine(l, x, μ, σ², affine_shape)
+function affine(l, x, μ, σ², nc::NormConfig{true})
+  affine_shape = getaffine(nc, size(x))
   γ = reshape(l.γ, affine_shape)
   β = reshape(l.β, affine_shape)
   l.λ.((γ .* (x .- μ) ./ sqrt.(σ² .+ l.ϵ)) .+ β)
 end
 
-affine(l, x, μ, σ², affine_shape::Nothing) = l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
+affine(l, x, μ, σ², nc::NormConfig{false}) = l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
 
 # function affine(l, x, μ, σ², affine_shape)
 #   res = (x .- μ) ./ sqrt.(σ² .+ l.ϵ)
@@ -279,14 +293,12 @@ end
 # trainable(bn::BatchNorm) = hasaffine(bn) ? (bn.β, bn.γ) : ()
 
 function (BN::BatchNorm)(x)
-  N = ndims(x)
+  N = ndims(x)::Int
   @assert size(x, N - 1) == BN.chs
   reduce_dims = [1:N-2; N]
-  affine_shape = BN.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
-  ts = BN.track_stats ? BN.track_stats : nothing
-  μ, σ² = norm_forward(BN, ts, x;
-                       reduce_dims = reduce_dims)
-  affine(BN, x, μ, σ², affine_shape)
+  nc = NormConfig(BN.affine, BN.track_stats, reduce_dims)
+  μ, σ² = norm_forward(BN, x, nc)
+  affine(BN, x, μ, σ², nc)
 end
 
 testmode!(m::BatchNorm, mode=true) =

From d0961a72eddade5376793acf14a6746efa95ed12 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Sun, 21 Feb 2021 22:00:33 +0530
Subject: [PATCH 11/44] use normconfig for other layers

---
 src/layers/normalise.jl | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index eee56f7dc4..aec3ead484 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -373,10 +373,9 @@ function (l::InstanceNorm)(x)
   @assert size(x, ndims(x)-1) == l.chs
   N = ndims(x)
   reduce_dims = 1:N-2
-  affine_shape = l.affine ? ntuple(i -> i == N-1 ? size(x, N-1) : 1, N) : nothing
-  ts = l.track_stats ? l.track_stats : nothing
-  μ, σ² = norm_forward(l, ts, x; reduce_dims = reduce_dims)
-  affine(l, x, μ, σ², affine_shape)
+  nc = NormConfig(l.affine, l.track_stats, reduce_dims)
+  μ, σ² = norm_forward(l, x, nc)
+  affine(l, x, μ, σ², nc)
 end
 
 testmode!(m::InstanceNorm, mode=true) =
@@ -457,14 +456,12 @@ function (gn::GroupNorm)(x)
   @assert ndims(x) > 2
   @assert size(x, ndims(x) - 1) == gn.chs
   sz = size(x)
-  x = reshape(x, sz[1:N-2]..., sz[N-1] ÷ gn.G, gn.G, sz[N])
   N = ndims(x)
+  x = reshape(x, sz[1:N-2]..., sz[N-1] ÷ gn.G, gn.G, sz[N])
   reduce_dims = 1:N-2
-  affine_shape = gn.affine ? ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N) : nothing
-  ts = gn.track_stats ? gn.track_stats : nothing
-  μ, σ² = norm_forward(gn, ts, x;
-                       reduce_dims = reduce_dims)
-  res = affine(gn, x, μ, σ², affine_shape)
+  nc = NormConfig(gn.affine, gn.track_stats, reduce_dims)
+  μ, σ² = norm_forward(gn, x, nc)
+  res = affine(gn, x, μ, σ², nc)
   return reshape(res, sz)
 end
 

From b091a80efcb049a28bf02d20e5d9cad6b7bf84f3 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Mon, 22 Feb 2021 01:40:22 +0530
Subject: [PATCH 12/44] typo

---
 src/layers/normalise.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index aec3ead484..f1c42bc52f 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -168,7 +168,7 @@ end
 
 NormConfig(affine, track_stats, reduce_dims) = NormConfig{affine, track_stats}(reduce_dims)
 
-function getaffine(nc::NormConfig{true}, sz_x) where N
+function getaffine(nc::NormConfig{true}, sz_x)
   n = length(sz_x)
   ntuple(i -> i == n-1 ? sz_x[n-1] : 1, n)
 end

From 680e64f379035edb991660627d63099214c6fae1 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Mon, 22 Feb 2021 02:55:01 +0530
Subject: [PATCH 13/44] backwards

---
 src/layers/normalise.jl | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index f1c42bc52f..14bdc2c0a2 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -185,11 +185,13 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, true}) where {
     μ = reshape(l.μ, stats_shape)
     σ² = reshape(l.σ², stats_shape)
   else  # trainmode or testmode without tracked stats
-    μ = mean(x; dims = nc.dims)
-    σ² = mean((x .- μ) .^ 2; dims = nc.dims)
-    μ, σ² = track_stats(x, μ, σ², l.momentum, reduce_dims = nc.dims)
-    l.μ .= μ
-    l.σ² .= σ²
+    μ = mean(x; dims = getaffine(nc, size(x)))
+    σ² = sum((x .- μ) .^ 2; dims = getaffine(nc, size(x))) ./ l.chs
+    μ, σ² = track_stats(x, l.μ, l.σ², l.momentum, reduce_dims = nc.dims)
+    Zygote.ignore() do
+      l.μ = reshape(μ, :)
+      l.σ² = reshape(σ², :)
+    end
   end
   μ, σ²
 end
@@ -201,7 +203,7 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, false}) where
 end
 
 function track_stats(x::AbstractArray{T,N}, μ, σ², mtm; reduce_dims) where {T,N}
-  m = prod(size(x)[reduce_dims])
+  m = prod(size(x)[collect(reduce_dims)])
   μnew = vec(N == last(reduce_dims) ? μ : mean(μ, dims = N))
   σ²new = vec(N == last(reduce_dims) ? σ² : mean(σ², dims = N))
   μ = (1 - mtm) .* μ .+ mtm .* μnew
@@ -214,7 +216,10 @@ function affine(l, x, μ, σ², nc::NormConfig{true})
   affine_shape = getaffine(nc, size(x))
   γ = reshape(l.γ, affine_shape)
   β = reshape(l.β, affine_shape)
-  l.λ.((γ .* (x .- μ) ./ sqrt.(σ² .+ l.ϵ)) .+ β)
+  μ = reshape(μ, affine_shape)
+  σ² = reshape(σ², affine_shape)
+  x̂ = (x .- μ) ./ sqrt.(σ² .+ l.ϵ)
+  l.λ.(γ .* x̂ .+ β)
 end
 
 affine(l, x, μ, σ², nc::NormConfig{false}) = l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
@@ -297,6 +302,7 @@ function (BN::BatchNorm)(x)
   @assert size(x, N - 1) == BN.chs
   reduce_dims = [1:N-2; N]
   nc = NormConfig(BN.affine, BN.track_stats, reduce_dims)
+  @show nc
   μ, σ² = norm_forward(BN, x, nc)
   affine(BN, x, μ, σ², nc)
 end

From 4a265597f9ecd54c865cddec2b94175bd3c22b70 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Mon, 1 Mar 2021 17:12:16 +0530
Subject: [PATCH 14/44] first pass

---
 test/layers/normalisation.jl | 72 ++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 41 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 4caaa66ce9..2e802f5785 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -12,31 +12,31 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
   x = rand(100)
   m = Dropout(0.9)
   y = m(x)
-  @test count(a->a==0, y) > 50
+  @test count(a -> a == 0, y) > 50
   testmode!(m, true)
   y = m(x) # should override istraining
-  @test count(a->a==0, y) == 0
+  @test count(a -> a == 0, y) == 0
   testmode!(m, false)
   y = m(x)
-  @test count(a->a==0, y) > 50
+  @test count(a -> a == 0, y) > 50
 
   x = rand(Float32, 100)
   m = Chain(Dense(100,100),
             Dropout(0.9))
   y = m(x)
-  @test count(a->a == 0, y) > 50
+  @test count(a -> a == 0, y) > 50
   testmode!(m, true)
   y = m(x) # should override istraining
-  @test count(a->a == 0, y) == 0
+  @test count(a -> a == 0, y) == 0
 
   x = rand(100, 50)
   m = Dropout(0.5, dims = 2)
   y = m(x)
-  c = map(i->count(a->a==0, @view y[i, :]), 1:100)
+  c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
   @test minimum(c) == maximum(c)
   m = Dropout(0.5, dims = 1)
   y = m(x)
-  c = map(i->count(a->a==0, @view y[:, i]), 1:50)
+  c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
   @test minimum(c) == maximum(c)
 
   # issue #1084
@@ -45,24 +45,22 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
 
   testmode!(m)
   y = m(x)
-  @test count(a->a == 0, y) == 0
+  @test count(a -> a == 0, y) == 0
   trainmode!(m)
   y = m(x)
-  @test count(a->a == 0, y) > 50
+  @test count(a -> a == 0, y) > 50
 
-  y = Flux.dropout(x, 0.9, active=true)
-  @test count(a->a == 0, y) > 50
+  y = Flux.dropout(x, 0.9, active = true)
+  @test count(a -> a == 0, y) > 50
 
-  y = Flux.dropout(x, 0.9, active=false)
-  @test count(a->a == 0, y) == 0
+  y = Flux.dropout(x, 0.9, active = false)
+  @test count(a -> a == 0, y) == 0
 end
 
 @testset "BatchNorm" begin
   let m = BatchNorm(2), x = [1.0 3.0 5.0;
                              2.0 4.0 6.0]
 
-    @test Flux.hasaffine(m) == true
-    @test length(params(m)) == 2
 
     @test m.β == [0, 0]  # initβ(2)
     @test m.γ == [1, 1]  # initγ(2)
@@ -125,14 +123,14 @@ end
     @test (@allocated m(x)) <  100_000_000
   end
 
-  @test length(Flux.params(BatchNorm(10))) == 2
-  @test length(Flux.params(BatchNorm(10, affine=true))) == 2
-  @test length(Flux.params(BatchNorm(10, affine=false))) == 0
+  # @test length(Flux.params(BatchNorm(10))) == 2
+  # @test length(Flux.params(BatchNorm(10, affine=true))) == 2
+  # @test length(Flux.params(BatchNorm(10, affine=false))) == 0
 end
 
 @testset "InstanceNorm" begin
   # begin tests
-  let m = InstanceNorm(2; affine=true, track_stats=true), sizes = (3, 2, 2),
+  let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (3, 2, 2),
         x = reshape(collect(1:prod(sizes)), sizes)
 
       @test length(params(m)) == 2
@@ -176,7 +174,7 @@ end
   end
 
   # with activation function
-  let m = InstanceNorm(2, sigmoid; affine=true, track_stats=true), sizes = (3, 2, 2),
+  let m = InstanceNorm(2, sigmoid; affine = true, track_stats = true), sizes = (3, 2, 2),
       x = reshape(collect(1:prod(sizes)), sizes)
     x = Float64.(x)
     affine_shape = collect(sizes)
@@ -190,22 +188,18 @@ end
   end
 
   # with activation function
-  let m = InstanceNorm(2, sigmoid; affine=true, track_stats=false), sizes = (3, 2, 2),
+  let m = InstanceNorm(2, sigmoid; affine = true, track_stats = false), sizes = (3, 2, 2),
       x = reshape(collect(1:prod(sizes)), sizes)
 
-    @test Flux.hasaffine(m) == true
-    @test length(params(m)) == 2  
     x = Float64.(x)
     y = m(x)
     μ = mean(x, dims=1)
-    σ² = var(x, dims=1, corrected=false)
+    σ² = var(x, dims=1, corrected = false)
     @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ))   atol=1.0e-7
   end
 
   let m = InstanceNorm(2, sigmoid), sizes = (3, 2, 2),
       x = reshape(collect(1:prod(sizes)), sizes)
-    @test Flux.hasaffine(m) == false
-    @test length(params(m)) == 0
     
     x = Float64.(x)
     y = m(x)
@@ -215,7 +209,7 @@ end
   end
 
 
-  let m = trainmode!(InstanceNorm(2; affine=true)), sizes = (2, 4, 1, 2, 3),
+  let m = trainmode!(InstanceNorm(2; affine = true)), sizes = (2, 4, 1, 2, 3),
       x = Float32.(reshape(collect(1:prod(sizes)), sizes))
     y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3)
     y = reshape(m(y), sizes...)
@@ -223,7 +217,7 @@ end
   end
 
   # check that μ, σ², and the output are the correct size for higher rank tensors
-  let m = InstanceNorm(2; affine=true,track_stats=true), sizes = (5, 5, 3, 4, 2, 6),
+  let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (5, 5, 3, 4, 2, 6),
       x = reshape(Float32.(collect(1:prod(sizes))), sizes)
     y = m(x)
     @test size(m.μ) == (sizes[end - 1], )
@@ -243,31 +237,27 @@ end
   end
 
   @test length(Flux.params(InstanceNorm(10))) == 0
-  @test length(Flux.params(InstanceNorm(10, affine=true))) == 2
-  @test length(Flux.params(InstanceNorm(10, affine=false))) == 0
+  @test length(Flux.params(InstanceNorm(10, affine = true))) == 2
+  @test length(Flux.params(InstanceNorm(10, affine =false))) == 0
 end
 
 @testset "LayerNorm" begin
   x = rand(2,3)
-  @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1)
+  @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1)
   x = rand(2,3,4)
-  @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1)
+  @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1)
   x = rand(2,3,4,5)
-  @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims=1)
+  @test LayerNorm(2)(x) ≈ Flux.normalise(x, dims = 1)
   x = rand(2)
-  @test LayerNorm(2, tanh)(x) ≈ tanh.(Flux.normalise(x, dims=1))
+  @test LayerNorm(2, tanh)(x) ≈ tanh.(Flux.normalise(x, dims = 1))
 
   x = rand(2,3,4,5)
-  @test LayerNorm((2,3))(x) ≈ Flux.normalise(x, dims=(1,2))
+  @test LayerNorm((2,3))(x) ≈ Flux.normalise(x, dims = (1,2))
   x = rand(2,3,4,5)
-  @test LayerNorm((2,3,4))(x) ≈ Flux.normalise(x, dims=1:3)
+  @test LayerNorm((2,3,4))(x) ≈ Flux.normalise(x, dims = 1:3)
 
   m = LayerNorm((2,3,4))
-  @test Flux.hasaffine(m) == true
-  @test length(params(m)) == 2
-  m = LayerNorm((2,3,4), affine=false)
-  @test Flux.hasaffine(m) == false
-  @test length(params(m)) == 0
+  m = LayerNorm((2,3,4), affine = false)
 end
 
 @testset "GroupNorm" begin

From 1c38029b2689d749e118e9027a24ae7aaf40f25f Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Sun, 7 Mar 2021 15:35:06 +0530
Subject: [PATCH 15/44] Update src/layers/normalise.jl

Co-authored-by: Carlo Lucibello <carlo.lucibello@unibocconi.it>
---
 src/layers/normalise.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 14bdc2c0a2..dd0cf20c16 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -302,7 +302,6 @@ function (BN::BatchNorm)(x)
   @assert size(x, N - 1) == BN.chs
   reduce_dims = [1:N-2; N]
   nc = NormConfig(BN.affine, BN.track_stats, reduce_dims)
-  @show nc
   μ, σ² = norm_forward(BN, x, nc)
   affine(BN, x, μ, σ², nc)
 end

From 137acf9353d3b5f9302838ddcadb83fdaff47ac6 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 9 Mar 2021 02:04:33 +0530
Subject: [PATCH 16/44] don't use single instance in bn

---
 test/layers/normalisation.jl | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 2e802f5785..1323f1c114 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -12,7 +12,8 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
   x = rand(100)
   m = Dropout(0.9)
   y = m(x)
-  @test count(a -> a == 0, y) > 50
+  # By default no dropout is performed outside training
+  # @test count(a -> a == 0, y) > 50
   testmode!(m, true)
   y = m(x) # should override istraining
   @test count(a -> a == 0, y) == 0
@@ -24,7 +25,8 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
   m = Chain(Dense(100,100),
             Dropout(0.9))
   y = m(x)
-  @test count(a -> a == 0, y) > 50
+  # by default no dropout is performed outside training
+  # @test count(a -> a == 0, y) > 50
   testmode!(m, true)
   y = m(x) # should override istraining
   @test count(a -> a == 0, y) == 0
@@ -58,9 +60,7 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
 end
 
 @testset "BatchNorm" begin
-  let m = BatchNorm(2), x = [1.0 3.0 5.0;
-                             2.0 4.0 6.0]
-
+  let m = BatchNorm(2), x = reshape(1:6, 1,1,2,3)
 
     @test m.β == [0, 0]  # initβ(2)
     @test m.γ == [1, 1]  # initγ(2)
@@ -94,34 +94,33 @@ end
   end
 
   # with activation function
-  let m = BatchNorm(2, sigmoid), x = [1.0 3.0 5.0;
-                                      2.0 4.0 6.0]
+  let m = BatchNorm(2, sigmoid), x = reshape(1:6, 1,1,3,2)
     y = m(x)
     @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7)
   end
 
-  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:6), 3, 2, 1)
+  let m = BatchNorm(2), x = reshape(Float32.(1:6), 3, 2, 1)
     y = reshape(permutedims(x, [2, 1, 3]), 2, :)
     y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3])
     @test m(x) == y
   end
 
-  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:12), 2, 3, 2, 1)
+  let m = BatchNorm(2), x = reshape(Float32.(1:12), 2, 3, 2, 1)
     y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :)
     y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4])
     @test m(x) == y
   end
 
-  let m = trainmode!(BatchNorm(2)), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
+  let m = BatchNorm(2), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
     y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :)
     y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5])
     @test m(x) == y
   end
 
-  let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1);
-    m(x)
-    @test (@allocated m(x)) <  100_000_000
-  end
+  # let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1);
+  #   m(x)
+  #   @test (@allocated m(x)) <  100_000_000
+  # end
 
   # @test length(Flux.params(BatchNorm(10))) == 2
   # @test length(Flux.params(BatchNorm(10, affine=true))) == 2
@@ -133,7 +132,7 @@ end
   let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (3, 2, 2),
         x = reshape(collect(1:prod(sizes)), sizes)
 
-      @test length(params(m)) == 2
+      # @test length(params(m)) == 2
       x = Float32.(x)
       @test m.β == [0, 0]  # initβ(2)
       @test m.γ == [1, 1]  # initγ(2)
@@ -267,7 +266,7 @@ end
   let m = GroupNorm(4,2, track_stats=true), sizes = (3,4,2),
         x = reshape(collect(1:prod(sizes)), sizes)
 
-      @test length(params(m)) == 2
+      # @test length(params(m)) == 2
       x = Float32.(x)
       @test m.β == [0, 0, 0, 0]  # initβ(32)
       @test m.γ == [1, 1, 1, 1]  # initγ(32)

From db559c5003e2d4c132b3735db1d300968f77a102 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 9 Mar 2021 02:05:30 +0530
Subject: [PATCH 17/44] use prev stats to track

---
 src/layers/normalise.jl | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 14bdc2c0a2..11db9895bb 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -185,9 +185,11 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, true}) where {
     μ = reshape(l.μ, stats_shape)
     σ² = reshape(l.σ², stats_shape)
   else  # trainmode or testmode without tracked stats
-    μ = mean(x; dims = getaffine(nc, size(x)))
-    σ² = sum((x .- μ) .^ 2; dims = getaffine(nc, size(x))) ./ l.chs
-    μ, σ² = track_stats(x, l.μ, l.σ², l.momentum, reduce_dims = nc.dims)
+    μ = mean(x; dims = nc.dims)
+    σ² = sum((x .- μ) .^ 2; dims = nc.dims) ./ l.chs
+    μ, σ² = track_stats(x, (l.μ, l.σ²), (reshape(μ, size(l.μ)...), 
+                        reshape(σ², size(l.σ²)...)),
+                        l.momentum, reduce_dims = nc.dims)
     Zygote.ignore() do
       l.μ = reshape(μ, :)
       l.σ² = reshape(σ², :)
@@ -202,13 +204,13 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, false}) where
   μ, σ²
 end
 
-function track_stats(x::AbstractArray{T,N}, μ, σ², mtm; reduce_dims) where {T,N}
+function track_stats(x::AbstractArray{T,N}, (μprev, σ²prev), (μ, σ²), mtm; reduce_dims) where {T,N}
   m = prod(size(x)[collect(reduce_dims)])
-  μnew = vec(N == last(reduce_dims) ? μ : mean(μ, dims = N))
-  σ²new = vec(N == last(reduce_dims) ? σ² : mean(σ², dims = N))
-  μ = (1 - mtm) .* μ .+ mtm .* μnew
-  σ² = (1 - mtm) .* σ² .+ mtm .* (m / (m - one(T))) .* σ²new
-  μ, σ²
+  μnew = vec((N in reduce_dims) ? μ : mean(μ, dims = N))
+  σ²new = vec((N in reduce_dims) ? σ² : mean(σ², dims = N))
+  μ_ = (1 - mtm) .* μprev .+ mtm .* μnew
+  σ²_ = (1 - mtm) .* σ²prev .+ mtm .* (m / (m - one(T))) .* σ²new
+  μ_, σ²_
 end
 @nograd track_stats
 
@@ -302,7 +304,6 @@ function (BN::BatchNorm)(x)
   @assert size(x, N - 1) == BN.chs
   reduce_dims = [1:N-2; N]
   nc = NormConfig(BN.affine, BN.track_stats, reduce_dims)
-  @show nc
   μ, σ² = norm_forward(BN, x, nc)
   affine(BN, x, μ, σ², nc)
 end

From fcea84193939fb220d3b25d93283048ef814d7cd Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 9 Mar 2021 02:10:23 +0530
Subject: [PATCH 18/44] track stats tests

---
 test/layers/normalisation.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 1323f1c114..4b15a3dd61 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -60,7 +60,7 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
 end
 
 @testset "BatchNorm" begin
-  let m = BatchNorm(2), x = reshape(1:6, 1,1,2,3)
+  let m = BatchNorm(2, track_stats = false), x = reshape(1:6, 1,1,2,3)
 
     @test m.β == [0, 0]  # initβ(2)
     @test m.γ == [1, 1]  # initγ(2)
@@ -81,13 +81,15 @@ end
     # ∴ update rule with momentum:
     #  .1 * 3 + 0 = .3
     #  .1 * 4 + 0 = .4
+    m = BatchNorm(2, track_stats = true)
+    gs = gradient((m,x) -> sum(m(x)), m, x)
     @test m.μ ≈ reshape([0.3, 0.4], 2, 1)
 
     # julia> .1 .* var(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]
     # 2×1 Array{Float64,2}:
     #  1.3
     #  1.3
-    @test m.σ² ≈ .1 .* var(x, dims=2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]
+    @test m.σ² ≈ .1 .* var(x, dimsi = 4, corrected = false) .* (3 / 2).+ .9 .* [1., 1.]
     
     x′ = m(x)
     @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5)

From 647dcd99906d67ed1822ad26ee7cbda8a74f422f Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Wed, 10 Mar 2021 02:35:56 +0530
Subject: [PATCH 19/44] fix BN(3)

---
 test/layers/normalisation.jl | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 4b15a3dd61..770aac88ee 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -68,7 +68,7 @@ end
     # initial m.μ is 0
 
     y = m(x)
-    @test isapprox(y, [-1.22474 0 1.22474; -1.22474 0 1.22474], atol = 1.0e-5)
+    @test isapprox(y, reshape([-1.22474 0 1.22474; -1.22474 0 1.22474], 1, 1, 2, 3), atol = 1.0e-5)
     # julia> x
     #  2×3 Array{Float64,2}:
     #  1.0  3.0  5.0
@@ -85,38 +85,39 @@ end
     gs = gradient((m,x) -> sum(m(x)), m, x)
     @test m.μ ≈ reshape([0.3, 0.4], 2, 1)
 
-    # julia> .1 .* var(x, dims = 2, corrected=false) .* (3 / 2).+ .9 .* [1., 1.]
+    # julia> .1 .* var(x, dims = 4, corrected = true) .* (3 / 2).+ .9 .* [1., 1.]
     # 2×1 Array{Float64,2}:
-    #  1.3
-    #  1.3
-    @test m.σ² ≈ .1 .* var(x, dimsi = 4, corrected = false) .* (3 / 2).+ .9 .* [1., 1.]
-    
+    #  1.5
+    #  1.5
+    v = mean((0.1 .* var(x, dims = 4, corrected = true)) .* (3 / 2) .+ 0.9 .* [1.0, 1.0], dims = 3) |> x -> dropdims(x, dims = (3,4))
+    @test m.σ² ≈ v
+   
     x′ = m(x)
-    @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5)
+    @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.5), atol = 1.0e-5)
   end
 
   # with activation function
-  let m = BatchNorm(2, sigmoid), x = reshape(1:6, 1,1,3,2)
+  let m = trainmode!(BatchNorm(3, sigmoid)), x = reshape(1:6, 1,1,3,2)
     y = m(x)
-    @test isapprox(y, sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), atol = 1.0e-7)
+    @test_broken isapprox(y, mean(sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), dims = 1), atol = 1.0e-7)
   end
 
   let m = BatchNorm(2), x = reshape(Float32.(1:6), 3, 2, 1)
     y = reshape(permutedims(x, [2, 1, 3]), 2, :)
     y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3])
-    @test m(x) == y
+    @test m(x) ≈ y
   end
 
   let m = BatchNorm(2), x = reshape(Float32.(1:12), 2, 3, 2, 1)
     y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :)
     y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4])
-    @test m(x) == y
+    @test m(x) ≈ y
   end
 
   let m = BatchNorm(2), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
     y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :)
     y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5])
-    @test m(x) == y
+    @test m(x) ≈ y
   end
 
   # let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1);

From 561c94f3b585d6cc4f740a12f2bf99886a55ad34 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Thu, 11 Mar 2021 01:32:17 +0530
Subject: [PATCH 20/44] check instance norm tests

---
 test/layers/normalisation.jl | 252 +++++++++++++++++------------------
 1 file changed, 126 insertions(+), 126 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 770aac88ee..b7cb96e5f4 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -3,132 +3,132 @@ using Zygote: pullback
 
 evalwgrad(f, x...) = pullback(f, x...)[1]
 
-@testset "Dropout" begin
-  x = [1.,2.,3.]
-  @test x == Dropout(0.1)(x)
-  @test x == evalwgrad(Dropout(0), x)
-  @test zero(x) == evalwgrad(Dropout(1), x)
-
-  x = rand(100)
-  m = Dropout(0.9)
-  y = m(x)
-  # By default no dropout is performed outside training
-  # @test count(a -> a == 0, y) > 50
-  testmode!(m, true)
-  y = m(x) # should override istraining
-  @test count(a -> a == 0, y) == 0
-  testmode!(m, false)
-  y = m(x)
-  @test count(a -> a == 0, y) > 50
-
-  x = rand(Float32, 100)
-  m = Chain(Dense(100,100),
-            Dropout(0.9))
-  y = m(x)
-  # by default no dropout is performed outside training
-  # @test count(a -> a == 0, y) > 50
-  testmode!(m, true)
-  y = m(x) # should override istraining
-  @test count(a -> a == 0, y) == 0
-
-  x = rand(100, 50)
-  m = Dropout(0.5, dims = 2)
-  y = m(x)
-  c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
-  @test minimum(c) == maximum(c)
-  m = Dropout(0.5, dims = 1)
-  y = m(x)
-  c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
-  @test minimum(c) == maximum(c)
-
-  # issue #1084
-  m = Dropout(0.9)
-  x = rand(100)
-
-  testmode!(m)
-  y = m(x)
-  @test count(a -> a == 0, y) == 0
-  trainmode!(m)
-  y = m(x)
-  @test count(a -> a == 0, y) > 50
-
-  y = Flux.dropout(x, 0.9, active = true)
-  @test count(a -> a == 0, y) > 50
-
-  y = Flux.dropout(x, 0.9, active = false)
-  @test count(a -> a == 0, y) == 0
-end
-
-@testset "BatchNorm" begin
-  let m = BatchNorm(2, track_stats = false), x = reshape(1:6, 1,1,2,3)
-
-    @test m.β == [0, 0]  # initβ(2)
-    @test m.γ == [1, 1]  # initγ(2)
-    # initial m.σ is 1
-    # initial m.μ is 0
-
-    y = m(x)
-    @test isapprox(y, reshape([-1.22474 0 1.22474; -1.22474 0 1.22474], 1, 1, 2, 3), atol = 1.0e-5)
-    # julia> x
-    #  2×3 Array{Float64,2}:
-    #  1.0  3.0  5.0
-    #  2.0  4.0  6.0
-    #
-    # μ of batch will be
-    #  (1. + 3. + 5.) / 3 = 3
-    #  (2. + 4. + 6.) / 3 = 4
-    #
-    # ∴ update rule with momentum:
-    #  .1 * 3 + 0 = .3
-    #  .1 * 4 + 0 = .4
-    m = BatchNorm(2, track_stats = true)
-    gs = gradient((m,x) -> sum(m(x)), m, x)
-    @test m.μ ≈ reshape([0.3, 0.4], 2, 1)
-
-    # julia> .1 .* var(x, dims = 4, corrected = true) .* (3 / 2).+ .9 .* [1., 1.]
-    # 2×1 Array{Float64,2}:
-    #  1.5
-    #  1.5
-    v = mean((0.1 .* var(x, dims = 4, corrected = true)) .* (3 / 2) .+ 0.9 .* [1.0, 1.0], dims = 3) |> x -> dropdims(x, dims = (3,4))
-    @test m.σ² ≈ v
-   
-    x′ = m(x)
-    @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.5), atol = 1.0e-5)
-  end
-
-  # with activation function
-  let m = trainmode!(BatchNorm(3, sigmoid)), x = reshape(1:6, 1,1,3,2)
-    y = m(x)
-    @test_broken isapprox(y, mean(sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), dims = 1), atol = 1.0e-7)
-  end
-
-  let m = BatchNorm(2), x = reshape(Float32.(1:6), 3, 2, 1)
-    y = reshape(permutedims(x, [2, 1, 3]), 2, :)
-    y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3])
-    @test m(x) ≈ y
-  end
-
-  let m = BatchNorm(2), x = reshape(Float32.(1:12), 2, 3, 2, 1)
-    y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :)
-    y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4])
-    @test m(x) ≈ y
-  end
-
-  let m = BatchNorm(2), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
-    y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :)
-    y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5])
-    @test m(x) ≈ y
-  end
-
-  # let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1);
-  #   m(x)
-  #   @test (@allocated m(x)) <  100_000_000
-  # end
-
-  # @test length(Flux.params(BatchNorm(10))) == 2
-  # @test length(Flux.params(BatchNorm(10, affine=true))) == 2
-  # @test length(Flux.params(BatchNorm(10, affine=false))) == 0
-end
+# @testset "Dropout" begin
+#   x = [1.,2.,3.]
+#   @test x == Dropout(0.1)(x)
+#   @test x == evalwgrad(Dropout(0), x)
+#   @test zero(x) == evalwgrad(Dropout(1), x)
+# 
+#   x = rand(100)
+#   m = Dropout(0.9)
+#   y = m(x)
+#   # By default no dropout is performed outside training
+#   # @test count(a -> a == 0, y) > 50
+#   testmode!(m, true)
+#   y = m(x) # should override istraining
+#   @test count(a -> a == 0, y) == 0
+#   testmode!(m, false)
+#   y = m(x)
+#   @test count(a -> a == 0, y) > 50
+# 
+#   x = rand(Float32, 100)
+#   m = Chain(Dense(100,100),
+#             Dropout(0.9))
+#   y = m(x)
+#   # by default no dropout is performed outside training
+#   # @test count(a -> a == 0, y) > 50
+#   testmode!(m, true)
+#   y = m(x) # should override istraining
+#   @test count(a -> a == 0, y) == 0
+# 
+#   x = rand(100, 50)
+#   m = Dropout(0.5, dims = 2)
+#   y = m(x)
+#   c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
+#   @test minimum(c) == maximum(c)
+#   m = Dropout(0.5, dims = 1)
+#   y = m(x)
+#   c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
+#   @test minimum(c) == maximum(c)
+# 
+#   # issue #1084
+#   m = Dropout(0.9)
+#   x = rand(100)
+# 
+#   testmode!(m)
+#   y = m(x)
+#   @test count(a -> a == 0, y) == 0
+#   trainmode!(m)
+#   y = m(x)
+#   @test count(a -> a == 0, y) > 50
+# 
+#   y = Flux.dropout(x, 0.9, active = true)
+#   @test count(a -> a == 0, y) > 50
+# 
+#   y = Flux.dropout(x, 0.9, active = false)
+#   @test count(a -> a == 0, y) == 0
+# end
+# 
+# @testset "BatchNorm" begin
+#   let m = BatchNorm(2, track_stats = false), x = reshape(1:6, 1,1,2,3)
+# 
+#     @test m.β == [0, 0]  # initβ(2)
+#     @test m.γ == [1, 1]  # initγ(2)
+#     # initial m.σ is 1
+#     # initial m.μ is 0
+# 
+#     y = m(x)
+#     @test isapprox(y, reshape([-1.22474 0 1.22474; -1.22474 0 1.22474], 1, 1, 2, 3), atol = 1.0e-5)
+#     # julia> x
+#     #  2×3 Array{Float64,2}:
+#     #  1.0  3.0  5.0
+#     #  2.0  4.0  6.0
+#     #
+#     # μ of batch will be
+#     #  (1. + 3. + 5.) / 3 = 3
+#     #  (2. + 4. + 6.) / 3 = 4
+#     #
+#     # ∴ update rule with momentum:
+#     #  .1 * 3 + 0 = .3
+#     #  .1 * 4 + 0 = .4
+#     m = BatchNorm(2, track_stats = true)
+#     gs = gradient((m,x) -> sum(m(x)), m, x)
+#     @test m.μ ≈ reshape([0.3, 0.4], 2, 1)
+# 
+#     # julia> .1 .* var(x, dims = 4, corrected = true) .* (3 / 2).+ .9 .* [1., 1.]
+#     # 2×1 Array{Float64,2}:
+#     #  1.5
+#     #  1.5
+#     v = mean((0.1 .* var(x, dims = 4, corrected = true)) .* (3 / 2) .+ 0.9 .* [1.0, 1.0], dims = 3) |> x -> dropdims(x, dims = (3,4))
+#     @test m.σ² ≈ v
+#    
+#     x′ = m(x)
+#     @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.5), atol = 1.0e-5)
+#   end
+# 
+#   # with activation function
+#   let m = trainmode!(BatchNorm(3, sigmoid)), x = reshape(1:6, 1,1,3,2)
+#     y = m(x)
+#     @test_broken isapprox(y, mean(sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), dims = 1), atol = 1.0e-7)
+#   end
+# 
+#   let m = BatchNorm(2), x = reshape(Float32.(1:6), 3, 2, 1)
+#     y = reshape(permutedims(x, [2, 1, 3]), 2, :)
+#     y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3])
+#     @test m(x) ≈ y
+#   end
+# 
+#   let m = BatchNorm(2), x = reshape(Float32.(1:12), 2, 3, 2, 1)
+#     y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :)
+#     y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4])
+#     @test m(x) ≈ y
+#   end
+# 
+#   let m = BatchNorm(2), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
+#     y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :)
+#     y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5])
+#     @test m(x) ≈ y
+#   end
+# 
+#   # let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1);
+#   #   m(x)
+#   #   @test (@allocated m(x)) <  100_000_000
+#   # end
+# 
+#   # @test length(Flux.params(BatchNorm(10))) == 2
+#   # @test length(Flux.params(BatchNorm(10, affine=true))) == 2
+#   # @test length(Flux.params(BatchNorm(10, affine=false))) == 0
+# end
 
 @testset "InstanceNorm" begin
   # begin tests

From 332b13b1ca6c79b872159bee16234f7fdea9fb80 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Sat, 13 Mar 2021 02:43:49 +0530
Subject: [PATCH 21/44] clean up instance norm

---
 test/layers/normalisation.jl | 349 +++++++++++++++++------------------
 1 file changed, 172 insertions(+), 177 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index b7cb96e5f4..06e62b92c1 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -3,178 +3,178 @@ using Zygote: pullback
 
 evalwgrad(f, x...) = pullback(f, x...)[1]
 
-# @testset "Dropout" begin
-#   x = [1.,2.,3.]
-#   @test x == Dropout(0.1)(x)
-#   @test x == evalwgrad(Dropout(0), x)
-#   @test zero(x) == evalwgrad(Dropout(1), x)
-# 
-#   x = rand(100)
-#   m = Dropout(0.9)
-#   y = m(x)
-#   # By default no dropout is performed outside training
-#   # @test count(a -> a == 0, y) > 50
-#   testmode!(m, true)
-#   y = m(x) # should override istraining
-#   @test count(a -> a == 0, y) == 0
-#   testmode!(m, false)
-#   y = m(x)
-#   @test count(a -> a == 0, y) > 50
-# 
-#   x = rand(Float32, 100)
-#   m = Chain(Dense(100,100),
-#             Dropout(0.9))
-#   y = m(x)
-#   # by default no dropout is performed outside training
-#   # @test count(a -> a == 0, y) > 50
-#   testmode!(m, true)
-#   y = m(x) # should override istraining
-#   @test count(a -> a == 0, y) == 0
-# 
-#   x = rand(100, 50)
-#   m = Dropout(0.5, dims = 2)
-#   y = m(x)
-#   c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
-#   @test minimum(c) == maximum(c)
-#   m = Dropout(0.5, dims = 1)
-#   y = m(x)
-#   c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
-#   @test minimum(c) == maximum(c)
-# 
-#   # issue #1084
-#   m = Dropout(0.9)
-#   x = rand(100)
-# 
-#   testmode!(m)
-#   y = m(x)
-#   @test count(a -> a == 0, y) == 0
-#   trainmode!(m)
-#   y = m(x)
-#   @test count(a -> a == 0, y) > 50
-# 
-#   y = Flux.dropout(x, 0.9, active = true)
-#   @test count(a -> a == 0, y) > 50
-# 
-#   y = Flux.dropout(x, 0.9, active = false)
-#   @test count(a -> a == 0, y) == 0
-# end
-# 
-# @testset "BatchNorm" begin
-#   let m = BatchNorm(2, track_stats = false), x = reshape(1:6, 1,1,2,3)
-# 
-#     @test m.β == [0, 0]  # initβ(2)
-#     @test m.γ == [1, 1]  # initγ(2)
-#     # initial m.σ is 1
-#     # initial m.μ is 0
-# 
-#     y = m(x)
-#     @test isapprox(y, reshape([-1.22474 0 1.22474; -1.22474 0 1.22474], 1, 1, 2, 3), atol = 1.0e-5)
-#     # julia> x
-#     #  2×3 Array{Float64,2}:
-#     #  1.0  3.0  5.0
-#     #  2.0  4.0  6.0
-#     #
-#     # μ of batch will be
-#     #  (1. + 3. + 5.) / 3 = 3
-#     #  (2. + 4. + 6.) / 3 = 4
-#     #
-#     # ∴ update rule with momentum:
-#     #  .1 * 3 + 0 = .3
-#     #  .1 * 4 + 0 = .4
-#     m = BatchNorm(2, track_stats = true)
-#     gs = gradient((m,x) -> sum(m(x)), m, x)
-#     @test m.μ ≈ reshape([0.3, 0.4], 2, 1)
-# 
-#     # julia> .1 .* var(x, dims = 4, corrected = true) .* (3 / 2).+ .9 .* [1., 1.]
-#     # 2×1 Array{Float64,2}:
-#     #  1.5
-#     #  1.5
-#     v = mean((0.1 .* var(x, dims = 4, corrected = true)) .* (3 / 2) .+ 0.9 .* [1.0, 1.0], dims = 3) |> x -> dropdims(x, dims = (3,4))
-#     @test m.σ² ≈ v
-#    
-#     x′ = m(x)
-#     @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.5), atol = 1.0e-5)
-#   end
-# 
-#   # with activation function
-#   let m = trainmode!(BatchNorm(3, sigmoid)), x = reshape(1:6, 1,1,3,2)
-#     y = m(x)
-#     @test_broken isapprox(y, mean(sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), dims = 1), atol = 1.0e-7)
-#   end
-# 
-#   let m = BatchNorm(2), x = reshape(Float32.(1:6), 3, 2, 1)
-#     y = reshape(permutedims(x, [2, 1, 3]), 2, :)
-#     y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3])
-#     @test m(x) ≈ y
-#   end
-# 
-#   let m = BatchNorm(2), x = reshape(Float32.(1:12), 2, 3, 2, 1)
-#     y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :)
-#     y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4])
-#     @test m(x) ≈ y
-#   end
-# 
-#   let m = BatchNorm(2), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
-#     y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :)
-#     y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5])
-#     @test m(x) ≈ y
-#   end
-# 
-#   # let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1);
-#   #   m(x)
-#   #   @test (@allocated m(x)) <  100_000_000
-#   # end
-# 
-#   # @test length(Flux.params(BatchNorm(10))) == 2
-#   # @test length(Flux.params(BatchNorm(10, affine=true))) == 2
-#   # @test length(Flux.params(BatchNorm(10, affine=false))) == 0
-# end
+@testset "Dropout" begin
+  x = [1.,2.,3.]
+  @test x == Dropout(0.1)(x)
+  @test x == evalwgrad(Dropout(0), x)
+  @test zero(x) == evalwgrad(Dropout(1), x)
+
+  x = rand(100)
+  m = Dropout(0.9)
+  y = m(x)
+  # By default no dropout is performed outside training
+  # @test count(a -> a == 0, y) > 50
+  testmode!(m, true)
+  y = m(x) # should override istraining
+  @test count(a -> a == 0, y) == 0
+  testmode!(m, false)
+  y = m(x)
+  @test count(a -> a == 0, y) > 50
+
+  x = rand(Float32, 100)
+  m = Chain(Dense(100,100),
+            Dropout(0.9))
+  y = m(x)
+  # by default no dropout is performed outside training
+  # @test count(a -> a == 0, y) > 50
+  testmode!(m, true)
+  y = m(x) # should override istraining
+  @test count(a -> a == 0, y) == 0
+
+  x = rand(100, 50)
+  m = Dropout(0.5, dims = 2)
+  y = m(x)
+  c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
+  @test minimum(c) == maximum(c)
+  m = Dropout(0.5, dims = 1)
+  y = m(x)
+  c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
+  @test minimum(c) == maximum(c)
+
+  # issue #1084
+  m = Dropout(0.9)
+  x = rand(100)
+
+  testmode!(m)
+  y = m(x)
+  @test count(a -> a == 0, y) == 0
+  trainmode!(m)
+  y = m(x)
+  @test count(a -> a == 0, y) > 50
+
+  y = Flux.dropout(x, 0.9, active = true)
+  @test count(a -> a == 0, y) > 50
+
+  y = Flux.dropout(x, 0.9, active = false)
+  @test count(a -> a == 0, y) == 0
+end
 
-@testset "InstanceNorm" begin
-  # begin tests
-  let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (3, 2, 2),
-        x = reshape(collect(1:prod(sizes)), sizes)
+@testset "BatchNorm" begin
+  let m = BatchNorm(2, track_stats = false), x = reshape(1:6, 1,1,2,3)
 
-      # @test length(params(m)) == 2
-      x = Float32.(x)
-      @test m.β == [0, 0]  # initβ(2)
-      @test m.γ == [1, 1]  # initγ(2)
-      y = m(x)
+    @test m.β == [0, 0]  # initβ(2)
+    @test m.γ == [1, 1]  # initγ(2)
+    # initial m.σ is 1
+    # initial m.μ is 0
 
-      #julia> x
-      #[:, :, 1] =
-      # 1.0  4.0
-      # 2.0  5.0
-      # 3.0  6.0
-      #
-      #[:, :, 2] =
-      # 7.0  10.0
-      # 8.0  11.0
-      # 9.0  12.0
-      #
-      # μ will be
-      # (1. + 2. + 3.) / 3 = 2.
-      # (4. + 5. + 6.) / 3 = 5.
-      #
-      # (7. + 8. + 9.) / 3 = 8.
-      # (10. + 11. + 12.) / 3 = 11.
-      #
-      # ∴ update rule with momentum:
-      # (1. - .1) * 0 + .1 * (2. + 8.) / 2 = .5
-      # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8
-      N = ndims(x)
-      @test m.μ ≈ [0.5, 0.8]
-      n = prod(size(x,i) for i in 1:N-2)
-      corr = n / (n-1)
-      σ² = var(x, dims=1:N-2, corrected=false)
-      @test m.σ² ≈ 0.1*corr*vec(mean(σ², dims=N)) .+ 0.9 * 1
+    y = m(x)
+    @test isapprox(y, reshape([-1.22474 0 1.22474; -1.22474 0 1.22474], 1, 1, 2, 3), atol = 1.0e-5)
+    # julia> x
+    #  2×3 Array{Float64,2}:
+    #  1.0  3.0  5.0
+    #  2.0  4.0  6.0
+    #
+    # μ of batch will be
+    #  (1. + 3. + 5.) / 3 = 3
+    #  (2. + 4. + 6.) / 3 = 4
+    #
+    # ∴ update rule with momentum:
+    #  .1 * 3 + 0 = .3
+    #  .1 * 4 + 0 = .4
+    m = BatchNorm(2, track_stats = true)
+    gs = gradient((m,x) -> sum(m(x)), m, x)
+    @test m.μ ≈ reshape([0.3, 0.4], 2, 1)
+
+    # julia> .1 .* var(x, dims = 4, corrected = true) .* (3 / 2).+ .9 .* [1., 1.]
+    # 2×1 Array{Float64,2}:
+    #  1.5
+    #  1.5
+    v = mean((0.1 .* var(x, dims = 4, corrected = true)) .* (3 / 2) .+ 0.9 .* [1.0, 1.0], dims = 3) |> x -> dropdims(x, dims = (3,4))
+    @test m.σ² ≈ v
+   
+    x′ = m(x)
+    @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.5), atol = 1.0e-5)
+  end
 
-      y = m(x)
-      @test length(m.μ) == 2
-      @test length(m.σ²) == 2
-      @test y ≈ (x .- reshape(m.μ, 1,2,1)) ./ sqrt.(reshape(m.σ², 1,2,1) .+ 1f-5)   atol=1.0e-5
+  # with activation function
+  let m = trainmode!(BatchNorm(3, sigmoid)), x = reshape(1:6, 1,1,3,2)
+    y = m(x)
+    @test_broken isapprox(y, mean(sigmoid.((x .- m.μ) ./ sqrt.(m.σ² .+ m.ϵ)), dims = 1), atol = 1.0e-7)
   end
 
+  let m = BatchNorm(2), x = reshape(Float32.(1:6), 3, 2, 1)
+    y = reshape(permutedims(x, [2, 1, 3]), 2, :)
+    y = permutedims(reshape(m(y), 2, 3, 1), [2, 1, 3])
+    @test m(x) ≈ y
+  end
+
+  let m = BatchNorm(2), x = reshape(Float32.(1:12), 2, 3, 2, 1)
+    y = reshape(permutedims(x, [3, 1, 2, 4]), 2, :)
+    y = permutedims(reshape(m(y), 2, 2, 3, 1), [2, 3, 1, 4])
+    @test m(x) ≈ y
+  end
+
+  let m = BatchNorm(2), x = reshape(Float32.(1:24), 2, 2, 3, 2, 1)
+    y = reshape(permutedims(x, [4, 1, 2, 3, 5]), 2, :)
+    y = permutedims(reshape(m(y), 2, 2, 2, 3, 1), [2, 3, 4, 1, 5])
+    @test m(x) ≈ y
+  end
+
+  # let m = BatchNorm(32), x = randn(Float32, 416, 416, 32, 1);
+  #   m(x)
+  #   @test (@allocated m(x)) <  100_000_000
+  # end
+
+  # @test length(Flux.params(BatchNorm(10))) == 2
+  # @test length(Flux.params(BatchNorm(10, affine=true))) == 2
+  # @test length(Flux.params(BatchNorm(10, affine=false))) == 0
+end
+
+@testset "InstanceNorm" begin
+  # begin tests
+  m = InstanceNorm(2; affine = true, track_stats = true)
+  sizes = (3, 2, 2)
+  x = reshape(1:prod(sizes), sizes)
+
+  # @test length(params(m)) == 2
+  x = Float32.(x)
+  @test m.β == [0, 0]  # initβ(2)
+  @test m.γ == [1, 1]  # initγ(2)
+  y, back = pullback((m,x) -> m(x), m, x)
+
+  #julia> x
+  #[:, :, 1] =
+  # 1.0  4.0
+  # 2.0  5.0
+  # 3.0  6.0
+  #
+  #[:, :, 2] =
+  # 7.0  10.0
+  # 8.0  11.0
+  # 9.0  12.0
+  #
+  # μ will be
+  # (1. + 2. + 3.) / 3 = 2.
+  # (4. + 5. + 6.) / 3 = 5.
+  #
+  # (7. + 8. + 9.) / 3 = 8.
+  # (10. + 11. + 12.) / 3 = 11.
+  #
+  # ∴ update rule with momentum:
+  # (1. - .1) * 0 + .1 * (2. + 8.) / 2 = .5
+  # (1. - .1) * 0 + .1 * (5. + 11.) / 2 = .8
+  N = ndims(x)
+  @test m.μ ≈ [0.5, 0.8]
+  n = prod(size(x,i) for i in 1:N-2)
+  corr = n / (n-1)
+  σ² = var(x, dims = 1:N-2, corrected = false)
+  @test m.σ² ≈ 0.1 * corr * vec(mean(σ², dims = N)) .+ 0.9 * 1
+
+  y = m(x)
+  @test length(m.μ) == 2
+  @test length(m.σ²) == 2
+  @test y ≈ (x .- reshape(m.μ, 1,2,1)) ./ sqrt.(reshape(m.σ², 1,2,1) .+ 1f-5)   atol=1.0e-5
+
   # with activation function
   let m = InstanceNorm(2, sigmoid; affine = true, track_stats = true), sizes = (3, 2, 2),
       x = reshape(collect(1:prod(sizes)), sizes)
@@ -205,12 +205,12 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
     
     x = Float64.(x)
     y = m(x)
-    μ = mean(x, dims=1)
-    σ² = var(x, dims=1, corrected=false)
+    μ = mean(x, dims = 1)
+    σ² = var(x, dims = 1, corrected = false)
     @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ))   atol=1.0e-7
   end
 
-
+  # check trainmode!
   let m = trainmode!(InstanceNorm(2; affine = true)), sizes = (2, 4, 1, 2, 3),
       x = Float32.(reshape(collect(1:prod(sizes)), sizes))
     y = reshape(permutedims(x, [3, 1, 2, 4, 5]), :, 2, 3)
@@ -233,14 +233,9 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
     @test m_inorm(x) == reshape(m_bnorm(reshape(x, (sizes[1:end - 2]..., :, 1))), sizes)
   end
 
-  let m = InstanceNorm(32), x = randn(Float32, 416, 416, 32, 1);
-    m(x)
-    @test (@allocated m(x)) <  100_000_000
-  end
-
-  @test length(Flux.params(InstanceNorm(10))) == 0
-  @test length(Flux.params(InstanceNorm(10, affine = true))) == 2
-  @test length(Flux.params(InstanceNorm(10, affine =false))) == 0
+  # @test length(Flux.params(InstanceNorm(10))) == 0
+  # @test length(Flux.params(InstanceNorm(10, affine = true))) == 2
+  # @test length(Flux.params(InstanceNorm(10, affine =false))) == 0
 end
 
 @testset "LayerNorm" begin

From 30d654273cafe488a92049b3439da0c4c3265a4f Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Sat, 13 Mar 2021 02:45:19 +0530
Subject: [PATCH 22/44] dont reshape eagerly

---
 src/layers/normalise.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 11db9895bb..a97e30369b 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -187,9 +187,10 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, true}) where {
   else  # trainmode or testmode without tracked stats
     μ = mean(x; dims = nc.dims)
     σ² = sum((x .- μ) .^ 2; dims = nc.dims) ./ l.chs
-    μ, σ² = track_stats(x, (l.μ, l.σ²), (reshape(μ, size(l.μ)...), 
-                        reshape(σ², size(l.σ²)...)),
+
+    μ, σ² = track_stats(x, (l.μ, l.σ²), (μ,σ²),
                         l.momentum, reduce_dims = nc.dims)
+
     Zygote.ignore() do
       l.μ = reshape(μ, :)
       l.σ² = reshape(σ², :)

From 1d99fd6f0988e17b6ceddd7244703fd97599811e Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 16 Mar 2021 02:51:58 +0530
Subject: [PATCH 23/44] use mean instead of channel

---
 src/layers/normalise.jl | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index a97e30369b..7c51fb8eeb 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -186,14 +186,15 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, true}) where {
     σ² = reshape(l.σ², stats_shape)
   else  # trainmode or testmode without tracked stats
     μ = mean(x; dims = nc.dims)
-    σ² = sum((x .- μ) .^ 2; dims = nc.dims) ./ l.chs
+    σ² = mean((x .- μ) .^ 2; dims = nc.dims) # ./ l.chs
 
-    μ, σ² = track_stats(x, (l.μ, l.σ²), (μ,σ²),
+
+    μnew, σ²new = track_stats(x, (l.μ, l.σ²), (μ,σ²),
                         l.momentum, reduce_dims = nc.dims)
 
     Zygote.ignore() do
-      l.μ = reshape(μ, :)
-      l.σ² = reshape(σ², :)
+      l.μ = reshape(μnew, :)
+      l.σ² = reshape(σ²new, :)
     end
   end
   μ, σ²
@@ -219,13 +220,14 @@ function affine(l, x, μ, σ², nc::NormConfig{true})
   affine_shape = getaffine(nc, size(x))
   γ = reshape(l.γ, affine_shape)
   β = reshape(l.β, affine_shape)
-  μ = reshape(μ, affine_shape)
-  σ² = reshape(σ², affine_shape)
   x̂ = (x .- μ) ./ sqrt.(σ² .+ l.ϵ)
   l.λ.(γ .* x̂ .+ β)
 end
 
-affine(l, x, μ, σ², nc::NormConfig{false}) = l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
+function affine(l, x, μ, σ², nc::NormConfig{false}) 
+  affine_shape = getaffine(nc, size(x))
+  l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
+end
 
 # function affine(l, x, μ, σ², affine_shape)
 #   res = (x .- μ) ./ sqrt.(σ² .+ l.ϵ)

From e3ae11d11b101e7f1d0774f741dc373fe2aa9521 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Tue, 16 Mar 2021 02:54:14 +0530
Subject: [PATCH 24/44] unbreak a couple tests

---
 test/layers/normalisation.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 06e62b92c1..21579c2ef9 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -93,7 +93,7 @@ end
     @test m.σ² ≈ v
    
     x′ = m(x)
-    @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.5), atol = 1.0e-5)
+    @test isapprox(x′[1], (1 .- 0.3) / sqrt(1.3), atol = 1.0e-5)
   end
 
   # with activation function
@@ -220,7 +220,7 @@ end
 
   # check that μ, σ², and the output are the correct size for higher rank tensors
   let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (5, 5, 3, 4, 2, 6),
-      x = reshape(Float32.(collect(1:prod(sizes))), sizes)
+      x = reshape(Float32.(1:prod(sizes)), sizes)
     y = m(x)
     @test size(m.μ) == (sizes[end - 1], )
     @test size(m.σ²) == (sizes[end - 1], )

From c66202e8528ea3f72598c428c3cc3db32cdbae6e Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Wed, 17 Mar 2021 03:09:43 +0530
Subject: [PATCH 25/44] use non corrected variance

---
 test/layers/normalisation.jl | 118 +++++++++++++++++------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 21579c2ef9..3ac8486076 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -3,62 +3,62 @@ using Zygote: pullback
 
 evalwgrad(f, x...) = pullback(f, x...)[1]
 
-@testset "Dropout" begin
-  x = [1.,2.,3.]
-  @test x == Dropout(0.1)(x)
-  @test x == evalwgrad(Dropout(0), x)
-  @test zero(x) == evalwgrad(Dropout(1), x)
-
-  x = rand(100)
-  m = Dropout(0.9)
-  y = m(x)
-  # By default no dropout is performed outside training
-  # @test count(a -> a == 0, y) > 50
-  testmode!(m, true)
-  y = m(x) # should override istraining
-  @test count(a -> a == 0, y) == 0
-  testmode!(m, false)
-  y = m(x)
-  @test count(a -> a == 0, y) > 50
-
-  x = rand(Float32, 100)
-  m = Chain(Dense(100,100),
-            Dropout(0.9))
-  y = m(x)
-  # by default no dropout is performed outside training
-  # @test count(a -> a == 0, y) > 50
-  testmode!(m, true)
-  y = m(x) # should override istraining
-  @test count(a -> a == 0, y) == 0
-
-  x = rand(100, 50)
-  m = Dropout(0.5, dims = 2)
-  y = m(x)
-  c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
-  @test minimum(c) == maximum(c)
-  m = Dropout(0.5, dims = 1)
-  y = m(x)
-  c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
-  @test minimum(c) == maximum(c)
-
-  # issue #1084
-  m = Dropout(0.9)
-  x = rand(100)
-
-  testmode!(m)
-  y = m(x)
-  @test count(a -> a == 0, y) == 0
-  trainmode!(m)
-  y = m(x)
-  @test count(a -> a == 0, y) > 50
-
-  y = Flux.dropout(x, 0.9, active = true)
-  @test count(a -> a == 0, y) > 50
-
-  y = Flux.dropout(x, 0.9, active = false)
-  @test count(a -> a == 0, y) == 0
-end
-
+# @testset "Dropout" begin
+#   x = [1.,2.,3.]
+#   @test x == Dropout(0.1)(x)
+#   @test x == evalwgrad(Dropout(0), x)
+#   @test zero(x) == evalwgrad(Dropout(1), x)
+# 
+#   x = rand(100)
+#   m = Dropout(0.9)
+#   y = m(x)
+#   # By default no dropout is performed outside training
+#   # @test count(a -> a == 0, y) > 50
+#   testmode!(m, true)
+#   y = m(x) # should override istraining
+#   @test count(a -> a == 0, y) == 0
+#   testmode!(m, false)
+#   y = m(x)
+#   @test count(a -> a == 0, y) > 50
+# 
+#   x = rand(Float32, 100)
+#   m = Chain(Dense(100,100),
+#             Dropout(0.9))
+#   y = m(x)
+#   # by default no dropout is performed outside training
+#   # @test count(a -> a == 0, y) > 50
+#   testmode!(m, true)
+#   y = m(x) # should override istraining
+#   @test count(a -> a == 0, y) == 0
+# 
+#   x = rand(100, 50)
+#   m = Dropout(0.5, dims = 2)
+#   y = m(x)
+#   c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
+#   @test minimum(c) == maximum(c)
+#   m = Dropout(0.5, dims = 1)
+#   y = m(x)
+#   c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
+#   @test minimum(c) == maximum(c)
+# 
+#   # issue #1084
+#   m = Dropout(0.9)
+#   x = rand(100)
+# 
+#   testmode!(m)
+#   y = m(x)
+#   @test count(a -> a == 0, y) == 0
+#   trainmode!(m)
+#   y = m(x)
+#   @test count(a -> a == 0, y) > 50
+# 
+#   y = Flux.dropout(x, 0.9, active = true)
+#   @test count(a -> a == 0, y) > 50
+# 
+#   y = Flux.dropout(x, 0.9, active = false)
+#   @test count(a -> a == 0, y) == 0
+# end
+# 
 @testset "BatchNorm" begin
   let m = BatchNorm(2, track_stats = false), x = reshape(1:6, 1,1,2,3)
 
@@ -87,9 +87,9 @@ end
 
     # julia> .1 .* var(x, dims = 4, corrected = true) .* (3 / 2).+ .9 .* [1., 1.]
     # 2×1 Array{Float64,2}:
-    #  1.5
-    #  1.5
-    v = mean((0.1 .* var(x, dims = 4, corrected = true)) .* (3 / 2) .+ 0.9 .* [1.0, 1.0], dims = 3) |> x -> dropdims(x, dims = (3,4))
+    #  1.3
+    #  1.3
+    v = mean((0.1 .* var(x, dims = 4, corrected = false)) .* (3 / 2) .+ 0.9 .* [1.0, 1.0], dims = 3) |> x -> dropdims(x, dims = (3,4))
     @test m.σ² ≈ v
    
     x′ = m(x)

From d6fac5616d2ef048689835ceb9caf753eae8f3a3 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Wed, 17 Mar 2021 03:10:26 +0530
Subject: [PATCH 26/44] typo

---
 test/layers/normalisation.jl | 112 +++++++++++++++++------------------
 1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 3ac8486076..344bd5654c 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -3,62 +3,62 @@ using Zygote: pullback
 
 evalwgrad(f, x...) = pullback(f, x...)[1]
 
-# @testset "Dropout" begin
-#   x = [1.,2.,3.]
-#   @test x == Dropout(0.1)(x)
-#   @test x == evalwgrad(Dropout(0), x)
-#   @test zero(x) == evalwgrad(Dropout(1), x)
-# 
-#   x = rand(100)
-#   m = Dropout(0.9)
-#   y = m(x)
-#   # By default no dropout is performed outside training
-#   # @test count(a -> a == 0, y) > 50
-#   testmode!(m, true)
-#   y = m(x) # should override istraining
-#   @test count(a -> a == 0, y) == 0
-#   testmode!(m, false)
-#   y = m(x)
-#   @test count(a -> a == 0, y) > 50
-# 
-#   x = rand(Float32, 100)
-#   m = Chain(Dense(100,100),
-#             Dropout(0.9))
-#   y = m(x)
-#   # by default no dropout is performed outside training
-#   # @test count(a -> a == 0, y) > 50
-#   testmode!(m, true)
-#   y = m(x) # should override istraining
-#   @test count(a -> a == 0, y) == 0
-# 
-#   x = rand(100, 50)
-#   m = Dropout(0.5, dims = 2)
-#   y = m(x)
-#   c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
-#   @test minimum(c) == maximum(c)
-#   m = Dropout(0.5, dims = 1)
-#   y = m(x)
-#   c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
-#   @test minimum(c) == maximum(c)
-# 
-#   # issue #1084
-#   m = Dropout(0.9)
-#   x = rand(100)
-# 
-#   testmode!(m)
-#   y = m(x)
-#   @test count(a -> a == 0, y) == 0
-#   trainmode!(m)
-#   y = m(x)
-#   @test count(a -> a == 0, y) > 50
-# 
-#   y = Flux.dropout(x, 0.9, active = true)
-#   @test count(a -> a == 0, y) > 50
-# 
-#   y = Flux.dropout(x, 0.9, active = false)
-#   @test count(a -> a == 0, y) == 0
-# end
-# 
+@testset "Dropout" begin
+  x = [1.,2.,3.]
+  @test x == Dropout(0.1)(x)
+  @test x == evalwgrad(Dropout(0), x)
+  @test zero(x) == evalwgrad(Dropout(1), x)
+
+  x = rand(100)
+  m = Dropout(0.9)
+  y = m(x)
+  # By default no dropout is performed outside training
+  # @test count(a -> a == 0, y) > 50
+  testmode!(m, true)
+  y = m(x) # should override istraining
+  @test count(a -> a == 0, y) == 0
+  testmode!(m, false)
+  y = m(x)
+  @test count(a -> a == 0, y) > 50
+
+  x = rand(Float32, 100)
+  m = Chain(Dense(100,100),
+            Dropout(0.9))
+  y = m(x)
+  # by default no dropout is performed outside training
+  # @test count(a -> a == 0, y) > 50
+  testmode!(m, true)
+  y = m(x) # should override istraining
+  @test count(a -> a == 0, y) == 0
+
+  x = rand(100, 50)
+  m = Dropout(0.5, dims = 2)
+  y = m(x)
+  c = map(i -> count(a -> a == 0, @view y[i, :]), 1:100)
+  @test minimum(c) == maximum(c)
+  m = Dropout(0.5, dims = 1)
+  y = m(x)
+  c = map(i -> count(a -> a==0, @view y[:, i]), 1:50)
+  @test minimum(c) == maximum(c)
+
+  # issue #1084
+  m = Dropout(0.9)
+  x = rand(100)
+
+  testmode!(m)
+  y = m(x)
+  @test count(a -> a == 0, y) == 0
+  trainmode!(m)
+  y = m(x)
+  @test count(a -> a == 0, y) > 50
+
+  y = Flux.dropout(x, 0.9, active = true)
+  @test count(a -> a == 0, y) > 50
+
+  y = Flux.dropout(x, 0.9, active = false)
+  @test count(a -> a == 0, y) == 0
+end
+
 @testset "BatchNorm" begin
   let m = BatchNorm(2, track_stats = false), x = reshape(1:6, 1,1,2,3)
 

From 16d0b96671bd0fe28612907c1e509d667e2600d3 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Wed, 17 Mar 2021 03:12:42 +0530
Subject: [PATCH 27/44] use train time eval

---
 test/layers/normalisation.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 344bd5654c..9a434019fc 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -204,7 +204,7 @@ end
       x = reshape(collect(1:prod(sizes)), sizes)
     
     x = Float64.(x)
-    y = m(x)
+    y, back = pullback((m,x) -> m(x), m, x)
     μ = mean(x, dims = 1)
     σ² = var(x, dims = 1, corrected = false)
     @test y ≈ sigmoid.((x .- μ) ./ sqrt.(σ² .+ m.ϵ))   atol=1.0e-7

From e7fe00b043a6fbac4f34227939f66d98f204cd0b Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Wed, 17 Mar 2021 03:39:12 +0530
Subject: [PATCH 28/44] check for dims in getaffine

---
 src/layers/normalise.jl | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 7c51fb8eeb..fda4c898cb 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -168,12 +168,12 @@ end
 
 NormConfig(affine, track_stats, reduce_dims) = NormConfig{affine, track_stats}(reduce_dims)
 
-function getaffine(nc::NormConfig{true}, sz_x)
+function getaffine(nc::NormConfig{true}, sz_x; dims = length(sz_x) - 1)
   n = length(sz_x)
-  ntuple(i -> i == n-1 ? sz_x[n-1] : 1, n)
+  ntuple(i -> i in dims ? sz_x[i] : 1, length(sz_x))
 end
 
-getaffine(nc::NormConfig{false}, args...) = ()
+getaffine(nc::NormConfig{false}, args...; kwargs...) = ()
 
 # For InstanceNorm, GroupNorm, and BatchNorm.
 # Compute the statistics on the slices specified by reduce_dims.
@@ -216,16 +216,16 @@ function track_stats(x::AbstractArray{T,N}, (μprev, σ²prev), (μ, σ²), mtm;
 end
 @nograd track_stats
 
-function affine(l, x, μ, σ², nc::NormConfig{true})
-  affine_shape = getaffine(nc, size(x))
+function affine(l, x::AbstractArray{T,N}, μ, σ², nc::NormConfig{true}; dims = N - 1) where {T,N}
+  affine_shape = getaffine(nc, size(x), dims = dims)
   γ = reshape(l.γ, affine_shape)
   β = reshape(l.β, affine_shape)
   x̂ = (x .- μ) ./ sqrt.(σ² .+ l.ϵ)
   l.λ.(γ .* x̂ .+ β)
 end
 
-function affine(l, x, μ, σ², nc::NormConfig{false}) 
-  affine_shape = getaffine(nc, size(x))
+function affine(l, x, μ, σ², nc::NormConfig{false}; dims = :) 
+  # affine_shape = getaffine(nc, size(x))
   l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
 end
 
@@ -468,10 +468,11 @@ function (gn::GroupNorm)(x)
   sz = size(x)
   N = ndims(x)
   x = reshape(x, sz[1:N-2]..., sz[N-1] ÷ gn.G, gn.G, sz[N])
+  n = ndims(x)
   reduce_dims = 1:N-2
   nc = NormConfig(gn.affine, gn.track_stats, reduce_dims)
   μ, σ² = norm_forward(gn, x, nc)
-  res = affine(gn, x, μ, σ², nc)
+  res = affine(gn, x, μ, σ², nc, dims = (n - 1, n - 2))
   return reshape(res, sz)
 end
 

From 9c01dd21e8ed9bd7c8c787e572033f42a487e31f Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Mon, 22 Mar 2021 12:30:33 +0530
Subject: [PATCH 29/44] use correct group dims

---
 src/layers/normalise.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index fda4c898cb..ae5970a554 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -188,7 +188,6 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, true}) where {
     μ = mean(x; dims = nc.dims)
     σ² = mean((x .- μ) .^ 2; dims = nc.dims) # ./ l.chs
 
-
     μnew, σ²new = track_stats(x, (l.μ, l.σ²), (μ,σ²),
                         l.momentum, reduce_dims = nc.dims)
 
@@ -207,6 +206,7 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, false}) where
 end
 
 function track_stats(x::AbstractArray{T,N}, (μprev, σ²prev), (μ, σ²), mtm; reduce_dims) where {T,N}
+  reduce_dims = (1,2)
   m = prod(size(x)[collect(reduce_dims)])
   μnew = vec((N in reduce_dims) ? μ : mean(μ, dims = N))
   σ²new = vec((N in reduce_dims) ? σ² : mean(σ², dims = N))
@@ -469,7 +469,7 @@ function (gn::GroupNorm)(x)
   N = ndims(x)
   x = reshape(x, sz[1:N-2]..., sz[N-1] ÷ gn.G, gn.G, sz[N])
   n = ndims(x)
-  reduce_dims = 1:N-2
+  reduce_dims = 1:n-2
   nc = NormConfig(gn.affine, gn.track_stats, reduce_dims)
   μ, σ² = norm_forward(gn, x, nc)
   res = affine(gn, x, μ, σ², nc, dims = (n - 1, n - 2))

From 9abfe0c827b0ef42e3293b2f181ce45130847165 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Mon, 22 Mar 2021 12:33:05 +0530
Subject: [PATCH 30/44] typo

---
 src/layers/normalise.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index ae5970a554..2d4ce32e2b 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -206,7 +206,6 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, false}) where
 end
 
 function track_stats(x::AbstractArray{T,N}, (μprev, σ²prev), (μ, σ²), mtm; reduce_dims) where {T,N}
-  reduce_dims = (1,2)
   m = prod(size(x)[collect(reduce_dims)])
   μnew = vec((N in reduce_dims) ? μ : mean(μ, dims = N))
   σ²new = vec((N in reduce_dims) ? σ² : mean(σ², dims = N))

From a621ef69a3009c0bc5f0990af398249174e693e9 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Mon, 22 Mar 2021 12:42:10 +0530
Subject: [PATCH 31/44] use trainmode groupnorm test

---
 test/layers/normalisation.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 9a434019fc..2544506e92 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -269,7 +269,7 @@ end
       @test m.β == [0, 0, 0, 0]  # initβ(32)
       @test m.γ == [1, 1, 1, 1]  # initγ(32)
 
-      y = m(x)
+      y, back = pullback((m,x) -> m(x), m, x)
 
       #julia> x
       #[:, :, 1]  =

From e1746053ae78f007d3381905d16f768e36180db3 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Wed, 24 Mar 2021 17:21:34 +0530
Subject: [PATCH 32/44] cleanup

---
 src/layers/normalise.jl | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 2d4ce32e2b..9d0d4d73d3 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -158,7 +158,6 @@ end
 function Base.show(io::IO, l::LayerNorm)
   print(io, "LayerNorm($(l.size)")
   print(io, ", $(l.λ)")
-  print(io, ", affine = $(l.diag)")
   print(io, ")")
 end
 
@@ -168,10 +167,8 @@ end
 
 NormConfig(affine, track_stats, reduce_dims) = NormConfig{affine, track_stats}(reduce_dims)
 
-function getaffine(nc::NormConfig{true}, sz_x; dims = length(sz_x) - 1)
-  n = length(sz_x)
+getaffine(nc::NormConfig{true}, sz_x; dims = length(sz_x) - 1) =
   ntuple(i -> i in dims ? sz_x[i] : 1, length(sz_x))
-end
 
 getaffine(nc::NormConfig{false}, args...; kwargs...) = ()
 
@@ -224,7 +221,6 @@ function affine(l, x::AbstractArray{T,N}, μ, σ², nc::NormConfig{true}; dims =
 end
 
 function affine(l, x, μ, σ², nc::NormConfig{false}; dims = :) 
-  # affine_shape = getaffine(nc, size(x))
   l.λ.((x .- μ) ./ sqrt.(σ² .+ l.ϵ))
 end
 
@@ -316,16 +312,15 @@ testmode!(m::BatchNorm, mode=true) =
 function Base.show(io::IO, l::BatchNorm)
   print(io, "BatchNorm($(l.chs)")
   print(io, ", $(l.λ)")
-  print(io,  ", affine = ")
   print(io, ")")
 end
 
 
 """
-    InstanceNorm(channels::Integer, λ=identity;
-                 initβ=zeros, initγ=ones,
-                 affine=false, track_stats=false,
-                 ϵ=1f-5, momentum=0.1f0)
+    InstanceNorm(channels::Integer, λ = identity;
+                 initβ = zeros, initγ = ones,
+                 affine = false, track_stats = false,
+                 ϵ = 1f-5, momentum = 0.1f0)
 
 [Instance Normalization](https://arxiv.org/abs/1607.08022) layer.
 `channels` should be the size of the channel dimension in your data (see below).
@@ -393,7 +388,6 @@ testmode!(m::InstanceNorm, mode=true) =
 function Base.show(io::IO, l::InstanceNorm)
   print(io, "InstanceNorm($(l.chs)")
   print(io, ", $(l.λ)")
-  print(io, ", affine = ")
   print(io, ")")
 end
 
@@ -481,16 +475,5 @@ testmode!(m::GroupNorm, mode = true) =
 function Base.show(io::IO, l::GroupNorm)
   print(io, "GroupNorm($(l.chs), $(l.G)")
   print(io, ", $(l.λ)")
-  print(io, ", affine = ")
   print(io, ")")
 end
-
-# """
-#   hasaffine(l)
-# 
-# Return `true` if a normalisation layer has trainable shift and 
-# scale parameters, `false` otherwise.
-# 
-# See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref).
-# """
-# hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine

From 99901a7bbe593318b4383e66c12a4c87f4aff862 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Wed, 24 Mar 2021 17:26:01 +0530
Subject: [PATCH 33/44] use bias and gamma for trainable

---
 src/layers/normalise.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 9d0d4d73d3..9f6820b798 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -295,7 +295,7 @@ function BatchNorm(chs::Int, λ = identity;
 end
 
 @functor BatchNorm
-# trainable(bn::BatchNorm) = hasaffine(bn) ? (bn.β, bn.γ) : ()
+trainable(bn::BatchNorm) = (bn.β, bn.γ)
 
 function (BN::BatchNorm)(x)
   N = ndims(x)::Int
@@ -370,7 +370,7 @@ function InstanceNorm(chs::Int, λ = identity;
 end
 
 @functor InstanceNorm
-# trainable(in::InstanceNorm) = hasaffine(in) ? (in.β, in.γ) : ()
+trainable(in::InstanceNorm) = (in.β, in.γ)
 
 function (l::InstanceNorm)(x)
   @assert ndims(x) > 2
@@ -433,7 +433,7 @@ mutable struct GroupNorm{F,V,N,W}
 end
 
 @functor GroupNorm
-# trainable(gn::GroupNorm) = hasaffine(gn) ? (gn.β, gn.γ) : ()
+trainable(gn::GroupNorm) = (gn.β, gn.γ)
 
 function GroupNorm(chs::Int, G::Int, λ = identity;
               initβ = i -> zeros(Float32, i), 

From 9f481e48d97a45ff50fc5b7cb6ee2f16ff05e2df Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Wed, 24 Mar 2021 23:30:34 +0530
Subject: [PATCH 34/44] trainable

---
 src/layers/normalise.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 9f6820b798..dc49703b73 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -295,7 +295,7 @@ function BatchNorm(chs::Int, λ = identity;
 end
 
 @functor BatchNorm
-trainable(bn::BatchNorm) = (bn.β, bn.γ)
+trainable(bn::BatchNorm) = bn.affine ? (bn.β, bn.γ) : ()
 
 function (BN::BatchNorm)(x)
   N = ndims(x)::Int
@@ -370,7 +370,7 @@ function InstanceNorm(chs::Int, λ = identity;
 end
 
 @functor InstanceNorm
-trainable(in::InstanceNorm) = (in.β, in.γ)
+trainable(in::InstanceNorm) = in.affine ? (in.β, in.γ) : ()
 
 function (l::InstanceNorm)(x)
   @assert ndims(x) > 2
@@ -433,7 +433,7 @@ mutable struct GroupNorm{F,V,N,W}
 end
 
 @functor GroupNorm
-trainable(gn::GroupNorm) = (gn.β, gn.γ)
+trainable(gn::GroupNorm) = gn.affine ? (gn.β, gn.γ) : ()
 
 function GroupNorm(chs::Int, G::Int, λ = identity;
               initβ = i -> zeros(Float32, i), 

From e9d89abc1d87403eeac35cfda40f5df0de3ac488 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacopmuting.com>
Date: Fri, 26 Mar 2021 20:44:35 +0530
Subject: [PATCH 35/44] test fixes

---
 test/cuda/layers.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index ae819348ed..e7b0eb88fb 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -86,9 +86,9 @@ upsample = [x -> Upsample(scale=x)]
 
 @testset "function layers" begin
   x = rand(Float32, 3,3)
-  gpu_gradtest(x -> sum(Flux.normalise(x; dims=1)), x)
-  gpu_gradtest(x -> sum(Flux.normalise(x; dims=2)), x)
-  gpu_gradtest(x -> sum(Flux.normalise(x)), x)
+  gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=1)), x)
+  gpu_autodiff_test(x -> sum(Flux.normalise(x; dims=2)), x)
+  gpu_autodiff_test(x -> sum(Flux.normalise(x)), x)
 end
 
 @testset "BatchNorm mix stuff" begin

From 8f3844c1249c935782390e991d3425ded2563435 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Mon, 19 Apr 2021 19:15:25 +0530
Subject: [PATCH 36/44] new constructor

---
 src/layers/normalise.jl | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index dc49703b73..8ba38de8cd 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -289,9 +289,10 @@ function BatchNorm(chs::Int, λ = identity;
   μ = zeros(Float32, chs)
   σ² = ones(Float32, chs)
 
-  return BatchNorm(chs, λ, β, γ,
-            μ, σ², ϵ, momentum,
-            affine, track_stats, nothing)
+  BatchNorm(λ, β, γ,
+            μ, σ², ϵ, momentum, 
+            affine, track_stats, 
+            nothing, chs)
 end
 
 @functor BatchNorm
@@ -364,9 +365,10 @@ function InstanceNorm(chs::Int, λ = identity;
   γ = initγ(chs)
   μ = zeros(Float32, chs)
   σ² = ones(Float32, chs)
-  InstanceNorm(chs, λ, β, γ,
-            μ, σ², ϵ, momentum, 
-            affine, track_stats, nothing)
+  InstanceNorm(λ, β, γ,
+               μ, σ², ϵ, momentum, 
+               affine, track_stats,
+               nothing, chs)
 end
 
 @functor InstanceNorm

From 14a6372ef5a2af6be5a5fa52002247210f4272e6 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Mon, 19 Apr 2021 20:27:24 +0530
Subject: [PATCH 37/44] conflicts

---
 src/layers/normalise.jl | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 1bfc12976b..da36276735 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -289,11 +289,7 @@ function BatchNorm(chs::Int, λ = identity;
   μ = zeros(Float32, chs)
   σ² = ones(Float32, chs)
 
-<<<<<<< HEAD
   BatchNorm(λ, β, γ,
-=======
-  return BatchNorm(λ, β, γ,
->>>>>>> origin/master
             μ, σ², ϵ, momentum, 
             affine, track_stats, 
             nothing, chs)
@@ -362,7 +358,6 @@ end
 function InstanceNorm(chs::Int, λ = identity;
                     initβ = i -> zeros(Float32, i), 
                     initγ = i -> ones(Float32, i), 
-<<<<<<< HEAD
                     affine = true, track_stats = true,
                     ϵ = 1f-5, momentum = 0.1f0)
 
@@ -374,20 +369,6 @@ function InstanceNorm(chs::Int, λ = identity;
                μ, σ², ϵ, momentum, 
                affine, track_stats,
                nothing, chs)
-=======
-                    affine=false, track_stats=false,
-                    ϵ=1f-5, momentum=0.1f0)
-
-  β = affine ? initβ(chs) : nothing
-  γ = affine ? initγ(chs) : nothing
-  μ = track_stats ? zeros(Float32, chs) : nothing
-  σ² = track_stats ? ones(Float32, chs) : nothing
-
-  return InstanceNorm(λ, β, γ,
-            μ, σ², ϵ, momentum, 
-            affine, track_stats,
-            nothing, chs)
->>>>>>> origin/master
 end
 
 @functor InstanceNorm

From d82c3d3d6e9abb45f0758bcf5198eb00d8d0b7b7 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Mon, 19 Apr 2021 20:53:46 +0530
Subject: [PATCH 38/44] conflicts

---
 test/cuda/layers.jl | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index a817ccae56..3e3f7d2aa8 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -107,16 +107,13 @@ gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 1, test_cpu =
 gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5)
 
 upsample = [x -> Upsample(scale=x)]
- gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2))
- gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,))
+gpu_gradtest("Upsample 2d", upsample, rand(Float32, 3, 4, 2, 3), (2,2))
+gpu_gradtest("Upsample 1d", upsample, rand(Float32, 3, 4, 2, 3), (2,))
 
-<<<<<<< HEAD
- pixelshuffle = [PixelShuffle]
- gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3)
- gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3)
+pixelshuffle = [PixelShuffle]
+gpu_gradtest("PixelShuffle 2d", pixelshuffle, rand(Float32, 3, 4, 18, 3), 3)
+gpu_gradtest("PixelShuffle 1d", pixelshuffle, rand(Float32, 3, 18, 3), 3)
 
-=======
->>>>>>> origin/master
 @testset "function layers" begin
   x = rand(Float32, 3,3)
   gpu_gradtest(x -> sum(Flux.normalise(x; dims=1)), x)

From 19b91b20e8324d688fad5ed3884f6f1892b688bc Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 24 Jun 2021 20:00:26 +0530
Subject: [PATCH 39/44] size fix

---
 test/cuda/layers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
index 3e3f7d2aa8..a59f30cc50 100644
--- a/test/cuda/layers.jl
+++ b/test/cuda/layers.jl
@@ -146,7 +146,7 @@ end
 @testset "Extended BatchNorm" begin
   m_cpu = BatchNorm(2)
   m_gpu = m_cpu |> gpu
-  x_cpu = rand(Float32, 3, 2, 2)
+  x_cpu = rand(Float32, 3, 1, 2, 2)
   x_gpu = x_cpu |> gpu
 
   ## In :auto mode, track statistics only in gradient contest

From 0d4605d063ed3d367a3867d0b8a3f04178199851 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 24 Jun 2021 20:26:34 +0530
Subject: [PATCH 40/44] space cleanups + show

---
 src/layers/normalise.jl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index da36276735..45ad8339bb 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -313,6 +313,7 @@ testmode!(m::BatchNorm, mode=true) =
 function Base.show(io::IO, l::BatchNorm)
   print(io, "BatchNorm($(l.chs)")
   print(io, ", $(l.λ)")
+  print(io, ", affine = $(l.affine)")
   print(io, ")")
 end
 
@@ -390,6 +391,7 @@ testmode!(m::InstanceNorm, mode=true) =
 function Base.show(io::IO, l::InstanceNorm)
   print(io, "InstanceNorm($(l.chs)")
   print(io, ", $(l.λ)")
+  print(io, ", affine = $(l.affine)")
   print(io, ")")
 end
 
@@ -438,10 +440,10 @@ end
 trainable(gn::GroupNorm) = gn.affine ? (gn.β, gn.γ) : ()
 
 function GroupNorm(chs::Int, G::Int, λ = identity;
-              initβ = i -> zeros(Float32, i), 
-              initγ = i -> ones(Float32, i), 
-              affine = true, track_stats = false,
-              ϵ = 1f-5, momentum = 0.1f0) 
+                   initβ = i -> zeros(Float32, i), 
+                   initγ = i -> ones(Float32, i), 
+                   affine = true, track_stats = false,
+                   ϵ = 1f-5, momentum = 0.1f0) 
 
   chs % G == 0 || error("The number of groups ($(G)) must divide the number of channels ($chs)")
 
@@ -450,7 +452,7 @@ function GroupNorm(chs::Int, G::Int, λ = identity;
   μ = zeros(Float32, G)
   σ² = ones(Float32, G)
 
-  return GroupNorm(G, λ, 
+  GroupNorm(G, λ, 
             β, γ,
             μ, σ², 
             ϵ, momentum, 
@@ -478,5 +480,6 @@ testmode!(m::GroupNorm, mode = true) =
 function Base.show(io::IO, l::GroupNorm)
   print(io, "GroupNorm($(l.chs), $(l.G)")
   print(io, ", $(l.λ)")
+  print(io, ", affine = $(l.affine)")
   print(io, ")")
 end

From 36084e58b153c84802d76afd153fae1cb6c7d92e Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 24 Jun 2021 20:32:44 +0530
Subject: [PATCH 41/44] add layer norm show methods

---
 src/layers/normalise.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 45ad8339bb..28e361929a 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -158,6 +158,8 @@ end
 function Base.show(io::IO, l::LayerNorm)
   print(io, "LayerNorm($(l.size)")
   print(io, ", $(l.λ)")
+  af = l.diag == identity ? false : true
+  print(io, ", affine = $(af)")
   print(io, ")")
 end
 

From 3c6f1ce67c122ce80b9c78c8829367fcf17f95d2 Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Thu, 24 Jun 2021 20:34:27 +0530
Subject: [PATCH 42/44] whitespace

---
 src/layers/normalise.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 28e361929a..373f04588e 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -188,7 +188,7 @@ function norm_forward(l, x::AbstractArray{T,N}, nc::NormConfig{A, true}) where {
     σ² = mean((x .- μ) .^ 2; dims = nc.dims) # ./ l.chs
 
     μnew, σ²new = track_stats(x, (l.μ, l.σ²), (μ,σ²),
-                        l.momentum, reduce_dims = nc.dims)
+                              l.momentum, reduce_dims = nc.dims)
 
     Zygote.ignore() do
       l.μ = reshape(μnew, :)

From 8f6de19c6e59a83ee87646940b521b6f51f9791b Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Fri, 25 Jun 2021 11:49:55 +0530
Subject: [PATCH 43/44] change some tests

---
 test/layers/normalisation.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index 2544506e92..b569974892 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -67,7 +67,7 @@ end
     # initial m.σ is 1
     # initial m.μ is 0
 
-    y = m(x)
+    y, _ = pullback((m,x) -> m(x), m, x)
     @test isapprox(y, reshape([-1.22474 0 1.22474; -1.22474 0 1.22474], 1, 1, 2, 3), atol = 1.0e-5)
     # julia> x
     #  2×3 Array{Float64,2}:
@@ -221,7 +221,7 @@ end
   # check that μ, σ², and the output are the correct size for higher rank tensors
   let m = InstanceNorm(2; affine = true, track_stats = true), sizes = (5, 5, 3, 4, 2, 6),
       x = reshape(Float32.(1:prod(sizes)), sizes)
-    y = m(x)
+    y, _ = pullback((m,x) -> m(x), m, x)
     @test size(m.μ) == (sizes[end - 1], )
     @test size(m.σ²) == (sizes[end - 1], )
     @test size(y) == sizes

From c525d4f8309098b1480fdacec6a1bb641878f40d Mon Sep 17 00:00:00 2001
From: Dhairya Gandhi <dhairya@juliacomputing.com>
Date: Tue, 29 Jun 2021 15:20:43 +0530
Subject: [PATCH 44/44] use affine as function

---
 src/layers/normalise.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 373f04588e..33bdd7c1e7 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -119,7 +119,7 @@ testmode!(m::AlphaDropout, mode=true) =
   (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
 
 """
-    LayerNorm(sz, λ = identity; affine = true, ϵ = 1fe-5)
+    LayerNorm(sz, λ = identity; affine = Diagonal(sz...), ϵ = 1fe-5)
 
 A [normalisation layer](https://arxiv.org/abs/1607.06450) designed to be
 used with recurrent hidden states. 
@@ -130,8 +130,8 @@ The input is normalised along the first `length(sz)` dimensions
 for tuple `sz`, along the first dimension for integer `sz`.
 The input  is expected to have first dimensions' size equal to `sz`. 
 
-If `affine = true` also applies a learnable shift and rescaling
-as in the [`Diagonal`](@ref) layer.
+By default, LayerNorm also applies a learnable shift and rescaling
+as in the [`Diagonal`](@ref) layer. To disable this, pass `affine = identity`.
 
 
 Se also [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`normalise`](@ref).
@@ -143,9 +143,9 @@ struct LayerNorm{F,D,T,S}
   sz::S
 end
 
-function LayerNorm(sz, λ = identity; affine = true, ϵ = 1f-5)
-  diag = affine ? Diagonal(sz...) : identity
-  return LayerNorm(λ, diag, ϵ, sz)
+function LayerNorm(sz, λ = identity; affine = Diagonal(sz...), ϵ = 1f-5)
+  # diag = affine ? Diagonal(sz...) : identity
+  return LayerNorm(λ, affine, ϵ, sz)
 end
 
 @functor LayerNorm