From fc3eff4141a62a65933e4d5c4bd45b30a3eca913 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 19 Feb 2022 13:07:04 -0500 Subject: [PATCH 1/7] rm Flux.Zeros, take N+1 --- src/Flux.jl | 1 - src/deprecations.jl | 8 ++++ src/layers/basic.jl | 6 ++- src/layers/conv.jl | 22 +++++----- src/utils.jl | 6 +-- src/zeros.jl | 52 ---------------------- test/cuda/layers.jl | 4 +- test/layers/basic.jl | 2 +- test/layers/conv.jl | 2 +- test/optimise.jl | 2 +- test/utils.jl | 100 ++++++++++--------------------------------- 11 files changed, 54 insertions(+), 151 deletions(-) delete mode 100644 src/zeros.jl diff --git a/src/Flux.jl b/src/Flux.jl index 2b204567d0..ad7233cfd7 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -36,7 +36,6 @@ using CUDA const use_cuda = Ref{Union{Nothing,Bool}}(nothing) include("utils.jl") -include("zeros.jl") include("onehot.jl") include("functor.jl") diff --git a/src/deprecations.jl b/src/deprecations.jl index e258f41897..c7ca77b6cc 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -19,6 +19,14 @@ zeros32(::Type, dims...) = throw(ArgumentError("Flux.zeros32 is always Float32, @deprecate frequencies(xs) group_counts(xs) +struct Zeros + function Zeros() + Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", :Zeros) + false + end +end +Zeros(args...) = Zeros() # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros()) + # Channel notation: Changed to match Conv, but very softly deprecated! # Perhaps change to @deprecate for v0.14, but there is no plan to remove these. Dense(in::Integer, out::Integer, σ = identity; kw...) = diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 952ff7d444..c61374455a 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -167,7 +167,7 @@ end function Base.show(io::IO, l::Dense) print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) l.σ == identity || print(io, ", ", l.σ) - l.bias == Zeros() && print(io, "; bias=false") + l.bias == false && print(io, "; bias=false") print(io, ")") end @@ -394,7 +394,11 @@ function Base.show(io::IO, l::Bilinear) print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) end l.σ == identity || print(io, ", ", l.σ) +<<<<<<< HEAD l.bias == Flux.Zeros() && print(io, "; bias=false") +======= + l.bias === false && print(io, ", bias=false") +>>>>>>> 1ef2cd377 (rm Flux.Zeros, take N+1) print(io, ")") end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index 519618e4be..8036780d95 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -6,6 +6,10 @@ _paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end] expand(N, i::Tuple) = i expand(N, i::Integer) = ntuple(_ -> i, N) +conv_reshape_bias(c) = c.bias isa AbstractVector ? + reshape(c.bias, map(_->1, c.stride)..., :, 1) : + c.bias + """ SamePad() @@ -61,8 +65,8 @@ Then: Keywords to control initialization of the layer: * `init` - Function used to generate initial weights. Defaults to `glorot_uniform`. -* `bias` - Initial bias is zero by default, this can be disabled entirely by setting it to - `false`, or another vector explicitly as `bias = randn(Float32, out)`. +* `bias` - The initial bias vector is all zero by default. Trainable bias can be disabled entirely + by setting this to `false`, or another vector can be provided such as `bias = randn(Float32, out)`. 
See also [`ConvTranspose`](@ref), [`DepthwiseConv`](@ref), [`CrossCor`](@ref). @@ -159,10 +163,9 @@ end @functor Conv function (c::Conv)(x::AbstractArray) - b = reshape(c.bias, map(_->1, c.stride)..., :, 1) σ = NNlib.fast_act(c.σ, x) cdims = DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, dilation = c.dilation, groups = c.groups) - σ.(conv(x, c.weight, cdims) .+ b) + σ.(conv(x, c.weight, cdims) .+ conv_reshape_bias(c)) end _channels_in(l ::Conv) = size(l.weight, ndims(l.weight)-1) * l.groups @@ -183,7 +186,7 @@ function _print_conv_opt(io::IO, l) if hasproperty(l, :groups) (l.groups == 1) || print(io, ", groups=", l.groups) end - (l.bias isa Zeros) && print(io, ", bias=false") + (l.bias === false) && print(io, ", bias=false") end """ @@ -277,10 +280,9 @@ end @nograd conv_transpose_dims function (c::ConvTranspose)(x::AbstractArray) - b = reshape(c.bias, map(_->1, c.stride)..., :, 1) σ = NNlib.fast_act(c.σ, x) cdims = conv_transpose_dims(c, x) - σ.(∇conv_data(x, c.weight, cdims) .+ b) + σ.(∇conv_data(x, c.weight, cdims) .+ conv_reshape_bias(c)) end function Base.show(io::IO, l::ConvTranspose) @@ -372,10 +374,9 @@ depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}; init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1]) function (c::DepthwiseConv)(x) - b = reshape(c.bias, map(_->1, c.stride)..., :, 1) σ = NNlib.fast_act(c.σ, x) cdims = DepthwiseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) - σ.(depthwiseconv(x, c.weight, cdims) .+ b) + σ.(depthwiseconv(x, c.weight, cdims) .+ conv_reshape_bias(c)) end function Base.show(io::IO, l::DepthwiseConv) @@ -453,10 +454,9 @@ function crosscor(x, w, ddims::DenseConvDims) end function (c::CrossCor)(x::AbstractArray) - b = reshape(c.bias, map(_->1, c.stride)..., :, 1) σ = NNlib.fast_act(c.σ, x) cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation) - σ.(crosscor(x, c.weight, cdims) .+ b) + σ.(crosscor(x, c.weight, cdims) .+ conv_reshape_bias(c)) end function Base.show(io::IO, l::CrossCor) diff --git a/src/utils.jl b/src/utils.jl index b5edbad5e6..5f9a4e98d7 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -441,17 +441,17 @@ rand32(dims...) = Base.rand(Float32, dims...) randn32(dims...) = Base.randn(Float32, dims...) """ - create_bias(weights, bias, length) + create_bias(weights, bias, size...) Return a bias parameter for a layer, based on the value given to the constructor's keyword `bias=bias`. * `bias == true` creates a zero vector, of the same type as weights. -* `bias == false` returns `Zeros()`, a special struct which exists only to encode the absence of bias. +* `bias == false` returns `false` now, which is understood by AD to be non-differentiable. * `bias::AbstractArray` uses the array provided, provided it has the correct size and eltype. If the type is wrong, it will be converted. """ function create_bias(weights::AbstractArray, bias::Bool, dims::Integer...) - bias ? fill!(similar(weights, dims...), 0) : Zeros() + bias ? fill!(similar(weights, dims...), 0) : false end function create_bias(weights::AbstractArray, bias::AbstractArray, dims::Integer...) 
size(bias) == dims || throw(DimensionMismatch("expected bias of size $(dims), got size $(size(bias))")) diff --git a/src/zeros.jl b/src/zeros.jl deleted file mode 100644 index 1281f4c87a..0000000000 --- a/src/zeros.jl +++ /dev/null @@ -1,52 +0,0 @@ -import Base: +, -, *,/, reshape, broadcasted - -""" - Zeros() - -Acts as a stand-in for an array of zeros that can be -used during training which is ignored by the optimisers. - -Useful to turn bias off for a forward pass of a layer. - -## Examples - -```julia-repl -julia> bias_less_conv = Conv((2,2), 1=>3; bias = false) -Conv((2, 2), 1=>3) - -julia> params(bias_less_conv) |> length -1 - -julia> bias_less_conv.bias -Flux.Zeros() -``` -""" -struct Zeros end -# To allow for things like Dense(10, 2, initb = Zeros) -Zeros(args...) = Zeros() - -Base.reshape(x::Zeros, dims...) = x - -+(::Zeros, b::AbstractArray) = b -+(a::AbstractArray, ::Zeros) = a -+(a::Zeros, ::Zeros) = a - --(::Zeros, b::AbstractArray) = -b --(a::AbstractArray, ::Zeros) = a --(a::Zeros, ::Zeros) = a - -# Some opportunities to avoid scalar indexing, intermediaries -# Since it replicates a little of what we expect Base to do, -# it should be possible to remove in the future, but for now, -# these help with performance. -broadcasted(::typeof(+), a::AbstractArray, b::Zeros) = a -broadcasted(::typeof(+), a::Zeros, b::AbstractArray) = b -broadcasted(::typeof(-), a::AbstractArray, b::Zeros) = a -broadcasted(::typeof(-), a::Zeros, b::AbstractArray) = -b -# Need adjoints for these or else the gradient w.r.t to the non-Zeros arg will be nothing as well -@adjoint broadcasted(::typeof(*), a::AbstractArray, b::Zeros) = zero(a), _ -> (nothing, zero(a), nothing) -@adjoint broadcasted(::typeof(*), a::Zeros, b::AbstractArray) = zero(b), _ -> (nothing, nothing, zero(b)) -@adjoint broadcasted(::typeof(/), a::Zeros, b::AbstractArray) = zero(b), _ -> (nothing, nothing, zero(b)) - -# Pass-through for layer constructors -create_bias(weights::AbstractArray, bias::Flux.Zeros, dims::Integer...) 
= bias diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl index 396e6c0ab5..677e50170d 100644 --- a/test/cuda/layers.jl +++ b/test/cuda/layers.jl @@ -155,8 +155,8 @@ end end end -@testset "Dense with Zeros bias" begin - l = Dense(ones(Float32, 4, 3), Flux.Zeros()) |> gpu +@testset "Dense without bias" begin + l = Dense(ones(Float32, 4, 3), false) |> gpu ip = zeros(Float32, 3, 7) |> gpu @test sum(l(ip)) ≈ 0.f0 diff --git a/test/layers/basic.jl b/test/layers/basic.jl index ca8e15a643..5befed57b4 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -175,7 +175,7 @@ import Flux: activations @test b1.σ == identity b2 = Flux.Bilinear(randn(3,4,5), false) - @test b2.bias == Flux.Zeros() + @test b2.bias === false b3 = Flux.Bilinear(randn(Float16, 3,4,5), true, tanh) @test b3.σ == tanh diff --git a/test/layers/conv.jl b/test/layers/conv.jl index 9ce1a27aa0..eb7d13be1c 100644 --- a/test/layers/conv.jl +++ b/test/layers/conv.jl @@ -273,7 +273,7 @@ end @testset "constructors: $fun" for fun in [Conv, CrossCor, ConvTranspose, DepthwiseConv] @test fun(rand(2,3,4)).bias isa Vector{Float64} - @test fun(rand(2,3,4,5), false).bias isa Flux.Zeros + @test fun(rand(2,3,4,5), false).bias === false if fun == Conv @test fun(rand(2,3,4,5,6), rand(6)).bias isa Vector{Float64} @test_skip fun(rand(2,3,4,5,6), 1:6).bias isa Vector{Float64} diff --git a/test/optimise.jl b/test/optimise.jl index 4c2d70b351..7b20a635db 100644 --- a/test/optimise.jl +++ b/test/optimise.jl @@ -15,7 +15,7 @@ using Random Nesterov(), RMSProp(), Momentum()] Random.seed!(42) w′ = randn(10, 10) - b = Flux.Zeros() + b = false loss(x) = Flux.Losses.mse(w*x, w′*x .+ b) for t = 1: 10^5 θ = params([w′, b]) diff --git a/test/utils.jl b/test/utils.jl index 3f4efe24ba..77d4b81b5f 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,8 +1,8 @@ using Flux using Flux: throttle, nfan, glorot_uniform, glorot_normal, kaiming_normal, kaiming_uniform, orthogonal, truncated_normal, - sparse_init, stack, unstack, Zeros, batch, unbatch, - unsqueeze, params + sparse_init, stack, unstack, batch, unbatch, + unsqueeze, params, loadparams! using StatsBase: var, std using Statistics, LinearAlgebra using Random @@ -263,88 +263,36 @@ end @test eltype(f32(f64(m))[1].weight) == Float32 end -@testset "Zeros" begin +@testset "zero bias" begin m = Dense(3,2; bias=false) - @test f64(m).bias === m.bias === Zeros() - @test f32(m).bias === m.bias === Zeros() + @test f64(m).bias === m.bias === false + @test f32(m).bias === m.bias === false @testset "Gradients for broadcasted $op with sizes $s" for op in (+,-,*), s in ((1,), (2,3)) o = ones(s) z = zeros(s) - Z = Zeros() @testset "Explicit" begin gfun(args...) = gradient((x, y) -> sum(op.(x,y)), args...) g = gfun(o, z) - @test gfun(o, Z) == (g[1], nothing) + @test gfun(o, false) == (g[1], nothing) g = gfun(z, o) - @test gfun(Z, o) == (nothing, g[2]) + @test gfun(false, o) == (nothing, g[2]) end @testset "Implicit" begin gfun(args...) = gradient(() -> sum(op.(args...)), params(collect(args))) g = gfun(o, z) - gres = gfun(o, Z) + gres = gfun(o, false) @test gres[o] == g[o] - @test Z ∉ gres.params + @test false ∉ gres.params g = gfun(z, o) - gres = gfun(Z, o) + gres = gfun(false, o) @test gres[o] == g[o] - @test Z ∉ gres.params - end - end - - @testset "Gradients for broadcasted / with sizes $s" for s in ((1,), (2,3)) - o = ones(s) - z = zeros(s) - Z = Zeros() # Only defined for 0-dim - - @testset "Explicit" begin - gfun(args...) = gradient((x, y) -> sum(x ./ y), args...) 
- g = gfun(z, o) - @test gfun(Z, o) == (nothing, g[2]) - end - - @testset "Implicit" begin - gfun(x,y) = gradient(() -> sum(x ./ y), params([x,y])) - - g = gfun(z, o) - gres = gfun(Z, o) - @test gres[o] == g[o] - @test Z ∉ gres.params - end - end - - @testset "Gradients for $op with sizes $s" for op in (+,-), s in (tuple(), (1,), (2,3)) - o = ones(s) - z = zeros(s) - Z = Zeros() - - - @testset "Explicit" begin - gfun(args...) = gradient((x, y) -> sum(op(x,y)), args...) - - g = gfun(o, z) - @test gfun(o, Z) == (g[1], nothing) - - g = gfun(z, o) - @test gfun(Z, o) == (nothing, g[2]) - end - - @testset "Implicit" begin - gfun(args...) = gradient(() -> sum(op(args...)), params(collect(args))) - g = gfun(o, z) - gres = gfun(o, Z) - @test gres[o] == g[o] - @test Z ∉ gres.params - - g = gfun(z, o) - gres = gfun(Z, o) - @test gres[o] == g[o] - @test Z ∉ gres.params + @test false ∉ gres.params end end end @@ -385,7 +333,7 @@ end dl(4, 3, bias) ) - nobias(n) = Zeros() + nobias(n) = false testdense(m, bt) = @testset "Check layer $i" for (i, (l1, l2)) in enumerate(zip(m, dm(bt))) @test l1.weight == l2.weight @test l1.bias == l2.bias @@ -393,11 +341,7 @@ end end @testset "loadparams!" begin - import Flux: loadparams! pars(w, b) = [w, b] - import Flux: loadparams!, Zeros - - pars(w, b::Zeros) = [w, Flux.zeros32(size(w,1))] pars(l) = pars(l.weight, l.bias) pararray(m) = mapreduce(pars, vcat, m) weights(m) = mapreduce(l -> [l.weight], vcat, m) @@ -407,16 +351,16 @@ end testdense(m, bt) end - @testset "$b1 to $b2" for (b1, b2, be) in ( - (Flux.zeros32, Flux.ones32, Flux.ones32), # Load ones as bias to a model with zeros as bias -> model gets ones as bias - (Flux.ones32, nobias, Flux.zeros32), # Load Zeros as bias to a model with ones as bias-> model gets zeros as bias - (nobias, Flux.ones32, nobias), # Load ones as bias to a model with Zeros as bias-> model bias does not change - ) - m1 = dm(b1) - m2 = dm(b2) - loadparams!(m1, b1 == nobias ? weights(m2) : pararray(m2)) - testdense(m1, be) - end + # @testset "$b1 to $b2" for (b1, b2, be) in ( + # (Flux.zeros32, Flux.ones32, Flux.ones32), # Load ones as bias to a model with zeros as bias -> model gets ones as bias + # (Flux.ones32, nobias, Flux.zeros32), # Load Zeros as bias to a model with ones as bias-> model gets zeros as bias + # (nobias, Flux.ones32, nobias), # Load ones as bias to a model with Zeros as bias-> model bias does not change + # ) + # m1 = dm(b1) + # m2 = dm(b2) + # loadparams!(m1, b1 == nobias ? 
weights(m2) : pararray(m2)) + # testdense(m1, be) + # end end @testset "destructure" begin From 160244153e8a38e610fc218a4272806b6bf640fe Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 19 Feb 2022 15:17:35 -0500 Subject: [PATCH 2/7] human-readable loadparams tests, same results --- test/utils.jl | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/test/utils.jl b/test/utils.jl index 77d4b81b5f..9b4ceacbb1 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -350,17 +350,6 @@ end loadparams!(m, params(m)) testdense(m, bt) end - - # @testset "$b1 to $b2" for (b1, b2, be) in ( - # (Flux.zeros32, Flux.ones32, Flux.ones32), # Load ones as bias to a model with zeros as bias -> model gets ones as bias - # (Flux.ones32, nobias, Flux.zeros32), # Load Zeros as bias to a model with ones as bias-> model gets zeros as bias - # (nobias, Flux.ones32, nobias), # Load ones as bias to a model with Zeros as bias-> model bias does not change - # ) - # m1 = dm(b1) - # m2 = dm(b2) - # loadparams!(m1, b1 == nobias ? weights(m2) : pararray(m2)) - # testdense(m1, be) - # end end @testset "destructure" begin @@ -386,6 +375,26 @@ end end end +@testset "loadparams! & absent bias" begin + m0 = Dense(2,3; bias=false, init = Flux.ones32) + m1 = Dense(2,3; bias = Flux.randn32(3)) + m2 = Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]) + + Flux.loadparams!(m1, Flux.params(m2)) + @test m1.bias == 7:9 + @test sum(m1.weight) == 21 + + # load from a model without bias: + Flux.loadparams!(m1, Flux.params(m0)) + @test_broken iszero(m1.bias) # should ideally recognise the false but Params doesn't store it. + @test sum(m1.weight) == 6 + + # load into a model without bias: + Flux.loadparams!(m0, Flux.params(m2)) # ignore the parameter which has nowhere to go? Or error? + @test iszero(m0.bias) # obviously unchanged + @test sum(m0.weight) == 21 +end + @testset "Train and test mode" begin mutable struct DummyLayer testing::Bool From 312cad5d2eef080a32c0c9d419eba3d5922c7fa1 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 19 Feb 2022 19:00:56 -0500 Subject: [PATCH 3/7] fixup --- src/layers/basic.jl | 8 ++------ test/utils.jl | 6 +++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index c61374455a..5974c34df7 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -301,7 +301,7 @@ end Bilinear((in1, in2) => out, σ=identity; bias=true, init=glorot_uniform) Bilinear(W::AbstractArray, [bias, σ]) -Creates a bilinear layer, which operates on two inputs at the same time. +Creates a fully connected layer which operates on two inputs. 
Its output, given vectors `x` & `y`, is another vector `z` with, for all `i ∈ 1:out`: @@ -394,11 +394,7 @@ function Base.show(io::IO, l::Bilinear) print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) end l.σ == identity || print(io, ", ", l.σ) -<<<<<<< HEAD - l.bias == Flux.Zeros() && print(io, "; bias=false") -======= - l.bias === false && print(io, ", bias=false") ->>>>>>> 1ef2cd377 (rm Flux.Zeros, take N+1) + l.bias === false && print(io, "; bias=false") print(io, ")") end diff --git a/test/utils.jl b/test/utils.jl index 9b4ceacbb1..8e763dbc1d 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -264,7 +264,7 @@ end end @testset "zero bias" begin - m = Dense(3,2; bias=false) + m = Dense(3 => 2; bias=false) @test f64(m).bias === m.bias === false @test f32(m).bias === m.bias === false @@ -376,8 +376,8 @@ end end @testset "loadparams! & absent bias" begin - m0 = Dense(2,3; bias=false, init = Flux.ones32) - m1 = Dense(2,3; bias = Flux.randn32(3)) + m0 = Dense(2 => 3; bias=false, init = Flux.ones32) + m1 = Dense(2 => 3; bias = Flux.randn32(3)) m2 = Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]) Flux.loadparams!(m1, Flux.params(m2)) From 875043ce213035d8778f9dc639954fffac081ffa Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 20 Feb 2022 15:44:50 -0500 Subject: [PATCH 4/7] make the words match the code --- src/utils.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 5f9a4e98d7..3258a57107 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -446,9 +446,10 @@ randn32(dims...) = Base.randn(Float32, dims...) Return a bias parameter for a layer, based on the value given to the constructor's keyword `bias=bias`. -* `bias == true` creates a zero vector, of the same type as weights. +* `bias == true` creates a trainable array of the given size, of the same type as `weights`, initialised to zero. * `bias == false` returns `false` now, which is understood by AD to be non-differentiable. -* `bias::AbstractArray` uses the array provided, provided it has the correct size and eltype. If the type is wrong, it will be converted. +* `bias::AbstractArray` uses the array provided, provided it has the correct size. + It does not at present correct the `eltype` to match that of `weights`. """ function create_bias(weights::AbstractArray, bias::Bool, dims::Integer...) bias ? fill!(similar(weights, dims...), 0) : false From fed18fde686a60c2e453a675729cb6c702d0c93d Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 21 Feb 2022 09:12:33 -0500 Subject: [PATCH 5/7] upgrade to test Chain, more errors, but same on master --- test/utils.jl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/test/utils.jl b/test/utils.jl index 8e763dbc1d..f240c56882 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -376,23 +376,23 @@ end end @testset "loadparams! 
& absent bias" begin - m0 = Dense(2 => 3; bias=false, init = Flux.ones32) - m1 = Dense(2 => 3; bias = Flux.randn32(3)) - m2 = Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]) + m0 = Chain(Dense(2 => 3; bias=false, init = Flux.ones32), Dense(3 => 1)) + m1 = Chain(Dense(2 => 3; bias = Flux.randn32(3)), Dense(3 => 1)) + m2 = Chain(Dense(Float32[1 2; 3 4; 5 6], Float32[7, 8, 9]), Dense(3 => 1)) Flux.loadparams!(m1, Flux.params(m2)) - @test m1.bias == 7:9 - @test sum(m1.weight) == 21 - - # load from a model without bias: - Flux.loadparams!(m1, Flux.params(m0)) - @test_broken iszero(m1.bias) # should ideally recognise the false but Params doesn't store it. - @test sum(m1.weight) == 6 - - # load into a model without bias: - Flux.loadparams!(m0, Flux.params(m2)) # ignore the parameter which has nowhere to go? Or error? - @test iszero(m0.bias) # obviously unchanged - @test sum(m0.weight) == 21 + @test m1[1].bias == 7:9 + @test sum(m1[1].weight) == 21 + + # load from a model without bias -- should ideally recognise the `false` but `Params` doesn't store it + @test_broken Flux.loadparams!(m1, Flux.params(m0)) + @test_broken iszero(m1[1].bias) + @test sum(m1[1].weight) == 6 # written before error + + # load into a model without bias -- should it ignore the parameter which has no home, or error? + @test_broken Flux.loadparams!(m0, Flux.params(m2)) + @test iszero(m0[1].bias) # obviously unchanged + @test sum(m0[1].weight) == 21 end @testset "Train and test mode" begin From d559685a369ff28c008041c744351ce0ee4aab85 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 5 Mar 2022 08:17:39 -0500 Subject: [PATCH 6/7] Update src/utils.jl Co-authored-by: Carlo Lucibello --- src/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.jl b/src/utils.jl index 3258a57107..ce715a4892 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -447,7 +447,7 @@ Return a bias parameter for a layer, based on the value given to the constructor's keyword `bias=bias`. * `bias == true` creates a trainable array of the given size, of the same type as `weights`, initialised to zero. -* `bias == false` returns `false` now, which is understood by AD to be non-differentiable. +* `bias == false` returns `false`, which is understood by AD to be non-differentiable. * `bias::AbstractArray` uses the array provided, provided it has the correct size. It does not at present correct the `eltype` to match that of `weights`. """ From b6e3f93f191cedb015b0471a9d83ef4f7a0ede4a Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 5 Mar 2022 11:38:22 -0500 Subject: [PATCH 7/7] Update src/layers/basic.jl --- src/layers/basic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 5974c34df7..8aaf4e7df0 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -301,7 +301,7 @@ end Bilinear((in1, in2) => out, σ=identity; bias=true, init=glorot_uniform) Bilinear(W::AbstractArray, [bias, σ]) -Creates a fully connected layer which operates on two inputs. +Creates a layer which is fully connected between two inputs and the output, and otherwise similar to [`Dense`](@ref). Its output, given vectors `x` & `y`, is another vector `z` with, for all `i ∈ 1:out`: