From f49e81e7b366689bc57a65f8817de4353f144270 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:49:22 -0500 Subject: [PATCH] Print channel dimensions of `Dense` like those of `Conv` (#1658) * print channel dims of Dense like Conv, and accept as input * do the same for Bilinear * fix tests * fix tests * docstring * change a few more * update * docs * rm circular ref * fixup * news + fixes --- NEWS.md | 1 + docs/src/gpu.md | 8 +-- docs/src/models/advanced.md | 29 ++++---- docs/src/models/basics.md | 12 ++-- docs/src/models/overview.md | 10 +-- docs/src/models/recurrence.md | 2 +- docs/src/models/regularisation.md | 12 ++-- docs/src/saving.md | 14 ++-- docs/src/training/training.md | 4 +- docs/src/utilities.md | 2 +- src/deprecations.jl | 10 +++ src/layers/basic.jl | 112 +++++++++++++++--------------- src/layers/normalise.jl | 4 +- src/layers/show.jl | 2 +- src/layers/stateless.jl | 2 +- src/utils.jl | 16 ++--- test/layers/show.jl | 32 ++++----- 17 files changed, 142 insertions(+), 130 deletions(-) diff --git a/NEWS.md b/NEWS.md index 278e006b2d..1b91c7823c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,7 @@ been removed in favour of MLDatasets.jl. * `flatten` is not exported anymore due to clash with Iterators.flatten. * Remove Juno.jl progress bar support as it is now obsolete. * `Dropout` gained improved compatibility with Int and Complex arrays and is now twice-differentiable. +* Notation `Dense(2 => 3, σ)` for channels matches `Conv`; the equivalent `Dense(2, 3, σ)` still works. * Many utily functions and the `DataLoader` are [now provided by MLUtils.jl](https://github.com/FluxML/Flux.jl/pull/1874). * The DataLoader is now compatible with generic dataset types implementing `MLUtils.numobs` and `MLUtils.getobs`. * Added [truncated normal initialisation](https://github.com/FluxML/Flux.jl/pull/1877) of weights. diff --git a/docs/src/gpu.md b/docs/src/gpu.md index ec0ca43a17..038976fb16 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -39,12 +39,12 @@ Note that we convert both the parameters (`W`, `b`) and the data set (`x`, `y`) If you define a structured model, like a `Dense` layer or `Chain`, you just need to convert the internal parameters. Flux provides `fmap`, which allows you to alter all parameters of a model at once. ```julia -d = Dense(10, 5, σ) +d = Dense(10 => 5, σ) d = fmap(cu, d) d.weight # CuArray d(cu(rand(10))) # CuArray output -m = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) +m = Chain(Dense(10 => 5, σ), Dense(5 => 2), softmax) m = fmap(cu, m) d(cu(rand(10))) ``` @@ -54,8 +54,8 @@ As a convenience, Flux provides the `gpu` function to convert models and data to ```julia julia> using Flux, CUDA -julia> m = Dense(10,5) |> gpu -Dense(10, 5) +julia> m = Dense(10, 5) |> gpu +Dense(10 => 5) julia> x = rand(10) |> gpu 10-element CuArray{Float32,1}: diff --git a/docs/src/models/advanced.md b/docs/src/models/advanced.md index d2e738362c..8757828594 100644 --- a/docs/src/models/advanced.md +++ b/docs/src/models/advanced.md @@ -74,10 +74,10 @@ this using the slicing features `Chain` provides: ```julia m = Chain( - Dense(784, 64, relu), - Dense(64, 64, relu), - Dense(32, 10) - ) + Dense(784 => 64, relu), + Dense(64 => 64, relu), + Dense(32 => 10) + ); ps = Flux.params(m[3:end]) ``` @@ -142,10 +142,11 @@ Lastly, we can test our new layer. 
Thanks to the proper abstractions in Julia, o ```julia model = Chain( Join(vcat, - Chain(Dense(1, 5),Dense(5, 1)), # branch 1 - Dense(1, 2), # branch 2 - Dense(1, 1)), # branch 3 - Dense(4, 1) + Chain(Dense(1 => 5, relu), Dense(5 => 1)), # branch 1 + Dense(1 => 2), # branch 2 + Dense(1 => 1) # branch 3 + ), + Dense(4 => 1) ) |> gpu xs = map(gpu, (rand(1), rand(1), rand(1))) @@ -164,11 +165,11 @@ Join(combine, paths...) = Join(combine, paths) # use vararg/tuple version of Parallel forward pass model = Chain( Join(vcat, - Chain(Dense(1, 5),Dense(5, 1)), - Dense(1, 2), - Dense(1, 1) + Chain(Dense(1 => 5, relu), Dense(5 => 1)), + Dense(1 => 2), + Dense(1 => 1) ), - Dense(4, 1) + Dense(4 => 1) ) |> gpu xs = map(gpu, (rand(1), rand(1), rand(1))) @@ -201,8 +202,8 @@ Flux.@functor Split Now we can test to see that our `Split` does indeed produce multiple outputs. ```julia model = Chain( - Dense(10, 5), - Split(Dense(5, 1),Dense(5, 3),Dense(5, 2)) + Dense(10 => 5), + Split(Dense(5 => 1, tanh), Dense(5 => 3, tanh), Dense(5 => 2)) ) |> gpu model(gpu(rand(10))) diff --git a/docs/src/models/basics.md b/docs/src/models/basics.md index 3f8e57b166..7c4d128970 100644 --- a/docs/src/models/basics.md +++ b/docs/src/models/basics.md @@ -158,14 +158,14 @@ a(rand(10)) # => 5-element vector Congratulations! You just built the `Dense` layer that comes with Flux. Flux has many interesting layers available, but they're all things you could have built yourself very easily. -(There is one small difference with `Dense` – for convenience it also takes an activation function, like `Dense(10, 5, σ)`.) +(There is one small difference with `Dense` – for convenience it also takes an activation function, like `Dense(10 => 5, σ)`.) ## Stacking It Up It's pretty common to write models that look something like: ```julia -layer1 = Dense(10, 5, σ) +layer1 = Dense(10 => 5, σ) # ... model(x) = layer3(layer2(layer1(x))) ``` @@ -175,7 +175,7 @@ For long chains, it might be a bit more intuitive to have a list of layers, like ```julia using Flux -layers = [Dense(10, 5, σ), Dense(5, 2), softmax] +layers = [Dense(10 => 5, σ), Dense(5 => 2), softmax] model(x) = foldl((x, m) -> m(x), layers, init = x) @@ -186,8 +186,8 @@ Handily, this is also provided for in Flux: ```julia model2 = Chain( - Dense(10, 5, σ), - Dense(5, 2), + Dense(10 => 5, σ), + Dense(5 => 2), softmax) model2(rand(10)) # => 2-element vector @@ -198,7 +198,7 @@ This quickly starts to look like a high-level deep learning library; yet you can A nice property of this approach is that because "models" are just functions (possibly with trainable parameters), you can also see this as simple function composition. ```julia -m = Dense(5, 2) ∘ Dense(10, 5, σ) +m = Dense(5 => 2) ∘ Dense(10 => 5, σ) m(rand(10)) ``` diff --git a/docs/src/models/overview.md b/docs/src/models/overview.md index 80a242e361..60e2471994 100644 --- a/docs/src/models/overview.md +++ b/docs/src/models/overview.md @@ -43,8 +43,8 @@ Normally, your training and test data come from real world observations, but thi Now, build a model to make predictions with `1` input and `1` output: ```julia -julia> model = Dense(1, 1) -Dense(1, 1) +julia> model = Dense(1 => 1) +Dense(1 => 1) julia> model.weight 1×1 Matrix{Float32}: @@ -58,10 +58,10 @@ julia> model.bias Under the hood, a dense layer is a struct with fields `weight` and `bias`. `weight` represents a weights' matrix and `bias` represents a bias vector. There's another way to think about a model. 
In Flux, *models are conceptually predictive functions*: ```julia -julia> predict = Dense(1, 1) +julia> predict = Dense(1 => 1) ``` -`Dense(1, 1)` also implements the function `σ(Wx+b)` where `W` and `b` are the weights and biases. `σ` is an activation function (more on activations later). Our model has one weight and one bias, but typical models will have many more. Think of weights and biases as knobs and levers Flux can use to tune predictions. Activation functions are transformations that tailor models to your needs. +`Dense(1 => 1)` also implements the function `σ(Wx+b)` where `W` and `b` are the weights and biases. `σ` is an activation function (more on activations later). Our model has one weight and one bias, but typical models will have many more. Think of weights and biases as knobs and levers Flux can use to tune predictions. Activation functions are transformations that tailor models to your needs. This model will already make predictions, though not accurate ones yet: @@ -185,7 +185,7 @@ The predictions are good. Here's how we got there. First, we gathered real-world data into the variables `x_train`, `y_train`, `x_test`, and `y_test`. The `x_*` data defines inputs, and the `y_*` data defines outputs. The `*_train` data is for training the model, and the `*_test` data is for verifying the model. Our data was based on the function `4x + 2`. -Then, we built a single input, single output predictive model, `predict = Dense(1, 1)`. The initial predictions weren't accurate, because we had not trained the model yet. +Then, we built a single input, single output predictive model, `predict = Dense(1 => 1)`. The initial predictions weren't accurate, because we had not trained the model yet. After building the model, we trained it with `train!(loss, parameters, data, opt)`. The loss function is first, followed by the `parameters` holding the weights and biases of the model, the training data, and the `Descent` optimizer provided by Flux. We ran the training step once, and observed that the parameters changed and the loss went down. Then, we ran the `train!` many times to finish the training process. diff --git a/docs/src/models/recurrence.md b/docs/src/models/recurrence.md index ba5f5ade0a..ee296dc5d9 100644 --- a/docs/src/models/recurrence.md +++ b/docs/src/models/recurrence.md @@ -74,7 +74,7 @@ Equivalent to the `RNN` stateful constructor, `LSTM` and `GRU` are also availabl Using these tools, we can now build the model shown in the above diagram with: ```julia -m = Chain(RNN(2, 5), Dense(5, 1)) +m = Chain(RNN(2, 5), Dense(5 => 1)) ``` In this example, each output has only one component. diff --git a/docs/src/models/regularisation.md b/docs/src/models/regularisation.md index b1089d5cef..0c975317e4 100644 --- a/docs/src/models/regularisation.md +++ b/docs/src/models/regularisation.md @@ -9,7 +9,7 @@ For example, say we have a simple regression. ```julia using Flux using Flux.Losses: logitcrossentropy -m = Dense(10, 5) +m = Dense(10 => 5) loss(x, y) = logitcrossentropy(m(x), y) ``` @@ -39,9 +39,9 @@ Here's a larger example with a multi-layer perceptron. 
```julia m = Chain( - Dense(28^2, 128, relu), - Dense(128, 32, relu), - Dense(32, 10)) + Dense(28^2 => 128, relu), + Dense(128 => 32, relu), + Dense(32 => 10)) sqnorm(x) = sum(abs2, x) @@ -55,8 +55,8 @@ One can also easily add per-layer regularisation via the `activations` function: ```julia julia> using Flux: activations -julia> c = Chain(Dense(10, 5, σ), Dense(5, 2), softmax) -Chain(Dense(10, 5, σ), Dense(5, 2), softmax) +julia> c = Chain(Dense(10 => 5, σ), Dense(5 => 2), softmax) +Chain(Dense(10 => 5, σ), Dense(5 => 2), softmax) julia> activations(c, rand(10)) 3-element Array{Any,1}: diff --git a/docs/src/saving.md b/docs/src/saving.md index 9b1db909ce..d9db750d1e 100644 --- a/docs/src/saving.md +++ b/docs/src/saving.md @@ -11,8 +11,8 @@ julia> using Flux julia> model = Chain(Dense(10, 5, NNlib.relu), Dense(5, 2), NNlib.softmax) Chain( - Dense(10, 5, relu), # 55 parameters - Dense(5, 2), # 12 parameters + Dense(10 => 5, relu), # 55 parameters + Dense(5 => 2), # 12 parameters NNlib.softmax, ) # Total: 4 arrays, 67 parameters, 524 bytes. @@ -32,8 +32,8 @@ julia> @load "mymodel.bson" model julia> model Chain( - Dense(10, 5, relu), # 55 parameters - Dense(5, 2), # 12 parameters + Dense(10 => 5, relu), # 55 parameters + Dense(5 => 2), # 12 parameters NNlib.softmax, ) # Total: 4 arrays, 67 parameters, 524 bytes. @@ -59,7 +59,7 @@ model parameters. ```Julia julia> using Flux -julia> model = Chain(Dense(10,5,relu),Dense(5,2),softmax) +julia> model = Chain(Dense(10 => 5,relu),Dense(5 => 2),softmax) Chain(Dense(10, 5, NNlib.relu), Dense(5, 2), NNlib.softmax) julia> weights = Flux.params(model); @@ -74,7 +74,7 @@ You can easily load parameters back into a model with `Flux.loadparams!`. ```julia julia> using Flux -julia> model = Chain(Dense(10,5,relu),Dense(5,2),softmax) +julia> model = Chain(Dense(10 => 5,relu),Dense(5 => 2),softmax) Chain(Dense(10, 5, NNlib.relu), Dense(5, 2), NNlib.softmax) julia> using BSON: @load @@ -94,7 +94,7 @@ In longer training runs it's a good idea to periodically save your model, so tha using Flux: throttle using BSON: @save -m = Chain(Dense(10,5,relu),Dense(5,2),softmax) +m = Chain(Dense(10 => 5, relu), Dense(5 => 2), softmax) evalcb = throttle(30) do # Show loss diff --git a/docs/src/training/training.md b/docs/src/training/training.md index 9db2330b65..0f3182681e 100644 --- a/docs/src/training/training.md +++ b/docs/src/training/training.md @@ -47,8 +47,8 @@ We can also define an objective in terms of some model: ```julia m = Chain( - Dense(784, 32, σ), - Dense(32, 10), softmax) + Dense(784 => 32, σ), + Dense(32 => 10), softmax) loss(x, y) = Flux.Losses.mse(m(x), y) ps = Flux.params(m) diff --git a/docs/src/utilities.md b/docs/src/utilities.md index 5878813a39..407ecac530 100644 --- a/docs/src/utilities.md +++ b/docs/src/utilities.md @@ -92,7 +92,7 @@ function make_model(width, height, inchannels, nclasses; # the input dimension to Dense is programatically calculated from # width, height, and nchannels - return Chain(conv_layers..., Dense(prod(conv_outsize), nclasses)) + return Chain(conv_layers..., Dense(prod(conv_outsize) => nclasses)) end ``` diff --git a/src/deprecations.jl b/src/deprecations.jl index 3998f25028..479709eab7 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -16,4 +16,14 @@ ones32(::Type, dims...) = throw(ArgumentError("Flux.ones32 is always Float32, us zeros32(::Type, dims...) 
= throw(ArgumentError("Flux.zeros32 is always Float32, use Base.zeros to specify the element type")) # v0.13 deprecations + @deprecate frequencies(xs) group_counts(xs) + +# Channel notation: Changed to match Conv, but very softly deprecated! +# Perhaps change to @deprecate for v0.14, but there is no plan to remove these. +Dense(in::Integer, out::Integer, σ = identity; kw...) = + Dense(in => out, σ; kw...) +Bilinear(in1::Integer, in2::Integer, out::Integer, σ = identity; kw...) = + Bilinear((in1, in2) => out, σ; kw...) +Embedding(in::Integer, out::Integer; kw...) = Embedding(in => out; kw...) + diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 3e22895e82..952ff7d444 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -14,15 +14,15 @@ julia> m = Chain(x -> x^2, x -> x+1); julia> m(5) == 26 true -julia> m = Chain(Dense(10, 5, tanh), Dense(5, 2)); +julia> m = Chain(Dense(10 => 5, tanh), Dense(5 => 2)); julia> x = rand(10, 32); julia> m(x) == m[2](m[1](x)) true -julia> m2 = Chain(enc = Chain(Flux.flatten, Dense(10, 5, tanh)), - dec = Dense(5, 2)); +julia> m2 = Chain(enc = Chain(Flux.flatten, Dense(10 => 5, tanh)), + dec = Dense(5 => 2)); julia> m2(x) == (m2[:dec] ∘ m2[:enc])(x) true @@ -98,10 +98,10 @@ extraChain(::Tuple{}, x) = () """ - Dense(in, out, σ=identity; bias=true, init=glorot_uniform) + Dense(in => out, σ=identity; bias=true, init=glorot_uniform) Dense(W::AbstractMatrix, [bias, σ]) -Create a traditional `Dense` layer, whose forward pass is given by: +Create a traditional fully connected layer, whose forward pass is given by: y = σ.(W * x .+ bias) @@ -117,8 +117,8 @@ The weight matrix and/or the bias vector (of length `out`) may also be provided # Examples ```jldoctest -julia> d = Dense(5, 2) -Dense(5, 2) # 12 parameters +julia> d = Dense(5 => 2) +Dense(5 => 2) # 12 parameters julia> d(rand(Float32, 5, 64)) |> size (2, 64) @@ -127,7 +127,7 @@ julia> d(rand(Float32, 5, 1, 1, 64)) |> size # treated as three batch dimension (2, 1, 1, 64) julia> d1 = Dense(ones(2, 5), false, tanh) # using provided weight matrix -Dense(5, 2, tanh; bias=false) # 10 parameters +Dense(5 => 2, tanh; bias=false) # 10 parameters julia> d1(ones(5)) 2-element Vector{Float64}: @@ -148,9 +148,8 @@ struct Dense{F, M<:AbstractMatrix, B} end end -function Dense(in::Integer, out::Integer, σ = identity; - init = glorot_uniform, bias=true) - +function Dense((in, out)::Pair{<:Integer, <:Integer}, σ = identity; + init = glorot_uniform, bias = true) Dense(init(out, in), bias, σ) end @@ -166,7 +165,7 @@ end reshape(a(reshape(x, size(x,1), :)), :, size(x)[2:end]...) function Base.show(io::IO, l::Dense) - print(io, "Dense(", size(l.weight, 2), ", ", size(l.weight, 1)) + print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1)) l.σ == identity || print(io, ", ", l.σ) l.bias == Zeros() && print(io, "; bias=false") print(io, ")") @@ -224,11 +223,11 @@ julia> m([-2 -1 0 1 2]) 1×5 Matrix{Int64}: 4 1 0 3 6 -julia> m3 = Maxout(() -> Dense(5, 7, tanh), 3) +julia> m3 = Maxout(() -> Dense(5 => 7, tanh), 3) Maxout( - Dense(5, 7, tanh), # 42 parameters - Dense(5, 7, tanh), # 42 parameters - Dense(5, 7, tanh), # 42 parameters + Dense(5 => 7, tanh), # 42 parameters + Dense(5 => 7, tanh), # 42 parameters + Dense(5 => 7, tanh), # 42 parameters ) # Total: 6 arrays, 126 parameters, 888 bytes. 
julia> Flux.outputsize(m3, (5, 11)) @@ -299,23 +298,25 @@ function Base.show(io::IO, b::SkipConnection) end """ - Bilinear(in1, in2, out, σ=identity; bias=true, init=glorot_uniform) + Bilinear((in1, in2) => out, σ=identity; bias=true, init=glorot_uniform) Bilinear(W::AbstractArray, [bias, σ]) -Creates a Bilinear layer, which operates on two inputs at the same time. +Creates a bilinear layer, which operates on two inputs at the same time. Its output, given vectors `x` & `y`, is another vector `z` with, for all `i ∈ 1:out`: z[i] = σ(x' * W[i,:,:] * y + bias[i]) If `x` and `y` are matrices, then each column of the output `z = B(x, y)` is of this form, -with `B` a Bilinear layer. +with `B` the Bilinear layer. -If `y` is not given, it is taken to be equal to `x`, i.e. `B(x) == B(x, x)` +If the second input `y` is not given, it is taken to be equal to `x`, i.e. `B(x) == B(x, x)` The two inputs may also be provided as a tuple, `B((x, y)) == B(x, y)`, which is accepted as the input to a `Chain`. +If the two input sizes are the same, `in1 == in2`, then you may write `Bilinear(in => out, σ)`. + The initialisation works as for [`Dense`](@ref) layer, with `W = init(out, in1, in2)`. By default the bias vector is `zeros(Float32, out)`, option `bias=false` will switch off trainable bias. Either of these may be provided explicitly. @@ -324,7 +325,8 @@ trainable bias. Either of these may be provided explicitly. ```jldoctest julia> x, y = randn(Float32, 5, 32), randn(Float32, 5, 32); -julia> B = Flux.Bilinear(5, 5, 7); +julia> B = Flux.Bilinear((5, 5) => 7) +Bilinear(5 => 7) # 182 parameters julia> B(x) |> size # interactions based on one input (7, 32) @@ -333,15 +335,15 @@ julia> B(x,y) == B((x,y)) # two inputs, may be given as a tuple true julia> sc = SkipConnection( - Chain(Dense(5, 20, tanh), Dense(20, 9, tanh)), - Flux.Bilinear(9, 5, 3, bias=false), + Chain(Dense(5 => 20, tanh), Dense(20 => 9, tanh)), + Flux.Bilinear((9, 5) => 3, bias=false), ); # used as the recombinator, with skip as the second input julia> sc(x) |> size (3, 32) julia> Flux.Bilinear(rand(4,8,16), false, tanh) # first dim of weight is the output -Bilinear(8, 16, 4, tanh, bias=false) +Bilinear((8, 16) => 4, tanh; bias=false) # 512 parameters ``` """ struct Bilinear{F,A,B} @@ -357,10 +359,11 @@ end @functor Bilinear -function Bilinear(in1::Integer, in2::Integer, out::Integer, σ = identity; - init = glorot_uniform, bias = true) +function Bilinear(((in1, in2), out)::Pair{<:Tuple, <:Integer}, σ = identity; + bias = true, init = glorot_uniform) Bilinear(init(out, in1, in2), bias, σ) end +Bilinear((in12, out)::Pair{<:Integer, <:Integer}, σ = identity; kw...) = Bilinear((in12, in12) => out, σ; kw...) function (a::Bilinear)(x::AbstractMatrix, y::AbstractMatrix) W, b, σ = a.weight, a.bias, a.σ @@ -385,9 +388,13 @@ end (a::Bilinear)(x::NTuple{2, AbstractArray}) = a(x[1], x[2]) function Base.show(io::IO, l::Bilinear) - print(io, "Bilinear(", size(l.weight, 2), ", ", size(l.weight, 3), ", ", size(l.weight, 1)) + if size(l.weight, 2) == size(l.weight, 3) + print(io, "Bilinear(", size(l.weight, 2), " => ", size(l.weight, 1)) + else + print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1)) + end l.σ == identity || print(io, ", ", l.σ) - l.bias == Flux.Zeros() && print(io, ", bias=false") + l.bias == Flux.Zeros() && print(io, "; bias=false") print(io, ")") end @@ -395,7 +402,7 @@ end Parallel(connection, layers...) Parallel(connection; name = layer, ...) 
-Create a `Parallel` layer that passes an input array to each path in +Create a layer which passes an input array to each path in `layers`, before reducing the output with `connection`. Called with one input `x`, this is equivalent to `connection([l(x) for l in layers]...)`. @@ -410,9 +417,9 @@ and [`Maxout`](@ref) which reduces by broadcasting `max`. # Examples ```jldoctest -julia> model = Chain(Dense(3, 5), - Parallel(vcat, Dense(5, 4), Chain(Dense(5, 7), Dense(7, 4))), - Dense(8, 17)); +julia> model = Chain(Dense(3 => 5), + Parallel(vcat, Dense(5 => 4), Chain(Dense(5 => 7), Dense(7 => 4))), + Dense(8 => 17)); julia> model(rand(3)) |> size (17,) @@ -420,8 +427,8 @@ julia> model(rand(3)) |> size julia> model2 = Parallel(+; α = Dense(10, 2, tanh), β = Dense(5, 2)) Parallel( +, - α = Dense(10, 2, tanh), # 22 parameters - β = Dense(5, 2), # 12 parameters + α = Dense(10 => 2, tanh), # 22 parameters + β = Dense(5 => 2), # 12 parameters ) # Total: 4 arrays, 34 parameters, 392 bytes. julia> model2(rand(10), rand(5)) |> size @@ -476,39 +483,33 @@ function Base.show(io::IO, m::Parallel) end """ - Embedding(in, out; init=randn) + Embedding(in => out; init=randn) A lookup table that stores embeddings of dimension `out` -for a vocabulary of size `in`. +for a vocabulary of size `in`. -This layers is often used to store word embeddings and retrieve them using indices. +This layer is often used to store word embeddings and retrieve them using indices. The input to the layer can be either a vector of indexes or the corresponding [onehot encoding](@ref Flux.OneHotArray). # Examples - -```julia-repl -julia> using Flux: Embedding - +```jldoctest julia> vocab_size, embed_size = 1000, 4; -julia> model = Embedding(vocab_size, embed_size) -Embedding(1000, 4) +julia> model = Flux.Embedding(vocab_size => embed_size) +Embedding(1000 => 4) # 4_000 parameters -julia> vocab_idxs = [1, 722, 53, 220, 3] +julia> vocab_idxs = [1, 722, 53, 220, 3]; -julia> x = OneHotMatrix(vocab_idxs, vocab_size); +julia> x = Flux.OneHotMatrix(vocab_idxs, vocab_size); summary(x) +"1000×5 OneHotMatrix(::Vector{Int64}) with eltype Bool" -julia> model(x) -4×5 Matrix{Float32}: - 0.91139 0.670462 0.463217 0.670462 0.110932 - 0.247225 -0.0823874 0.698694 -0.0823874 0.945958 - -0.393626 -0.590136 -0.545422 -0.590136 0.77743 - -0.497621 0.87595 -0.870251 0.87595 -0.772696 -``` +julia> model(x) |> summary +"4×5 Matrix{Float32}" julia> model(vocab_idxs) == model(x) true +``` """ struct Embedding{W} weight::W @@ -516,18 +517,17 @@ end @functor Embedding -Embedding(in::Integer, out::Integer; init = randn32) = Embedding(init(out, in)) +Embedding((in, out)::Pair{<:Integer, <:Integer}; init = randn32) = Embedding(init(out, in)) - (m::Embedding)(x::Integer) = m.weight[:, x] (m::Embedding)(x::AbstractVector) = NNlib.gather(m.weight, x) (m::Embedding)(x::AbstractArray) = reshape(m(vec(x)), :, size(x)...) 
function (m::Embedding)(x::Union{OneHotVector{T,L}, OneHotMatrix{T,L}}) where {T,L} - size(m.weight, 2) == L || throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L")) + size(m.weight, 2) == L || throw(DimensionMismatch("Matrix column must correspond with OneHot size: $(size(m.weight, 2)) != $L")) return m(onecold(x)) end function Base.show(io::IO, m::Embedding) - print(io, "Embedding($(size(m.weight, 2)), $(size(m.weight, 1)))") + print(io, "Embedding(", size(m.weight, 2), " => ", size(m.weight, 1), ")") end diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 0a18bf3fe1..7553f8b03f 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -265,9 +265,9 @@ Use [`testmode!`](@ref) during inference. # Examples ```julia m = Chain( - Dense(28^2, 64), + Dense(28^2 => 64), BatchNorm(64, relu), - Dense(64, 10), + Dense(64 => 10), BatchNorm(10), softmax) ``` diff --git a/src/layers/show.jl b/src/layers/show.jl index a37af36065..d03a253805 100644 --- a/src/layers/show.jl +++ b/src/layers/show.jl @@ -55,7 +55,7 @@ _show_children(m::Maxout) = m.layers _show_children(p::Parallel) = (p.connection, p.layers...) for T in [ - :Conv, :ConvTranspose, :CrossCor, :DepthwiseConv, :Dense, + :Conv, :ConvTranspose, :CrossCor, :DepthwiseConv, :Dense, :Bilinear, :Embedding, :BatchNorm, :LayerNorm, :InstanceNorm, :GroupNorm, ] @eval function Base.show(io::IO, m::MIME"text/plain", x::$T) diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl index 1a3a0df5ec..40375bfeae 100644 --- a/src/layers/stateless.jl +++ b/src/layers/stateless.jl @@ -13,7 +13,7 @@ julia> rand(3,4,5) |> Flux.flatten |> size julia> xs = rand(Float32, 10,10,3,7); -julia> m = Chain(Conv((3,3), 3=>4, pad=1), Flux.flatten, Dense(400,33)); +julia> m = Chain(Conv((3,3), 3 => 4, pad=1), Flux.flatten, Dense(400 => 33)); julia> xs |> m[1] |> size (10, 10, 4, 7) diff --git a/src/utils.jl b/src/utils.jl index 19b020704b..b5edbad5e6 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -577,22 +577,22 @@ julia> m1 = Chain(Dense(28^2, 64), BatchNorm(64, relu)); julia> m2 = Chain(m1, Dense(64, 10)) Chain( Chain( - Dense(784, 64), # 50_240 parameters + Dense(784 => 64), # 50_240 parameters BatchNorm(64, relu), # 128 parameters, plus 128 ), - Dense(64, 10), # 650 parameters + Dense(64 => 10), # 650 parameters ) # Total: 6 trainable arrays, 51_018 parameters, # plus 2 non-trainable, 128 parameters, summarysize 200.312 KiB. 
julia> Flux.modules(m2) 7-element Vector{Any}: - Chain(Chain(Dense(784, 64), BatchNorm(64, relu)), Dense(64, 10)) # 51_018 parameters, plus 128 non-trainable - (Chain(Dense(784, 64), BatchNorm(64, relu)), Dense(64, 10)) - Chain(Dense(784, 64), BatchNorm(64, relu)) # 50_368 parameters, plus 128 non-trainable - (Dense(784, 64), BatchNorm(64, relu)) - Dense(784, 64) # 50_240 parameters + Chain(Chain(Dense(784 => 64), BatchNorm(64, relu)), Dense(64 => 10)) # 51_018 parameters, plus 128 non-trainable + (Chain(Dense(784 => 64), BatchNorm(64, relu)), Dense(64 => 10)) + Chain(Dense(784 => 64), BatchNorm(64, relu)) # 50_368 parameters, plus 128 non-trainable + (Dense(784 => 64), BatchNorm(64, relu)) + Dense(784 => 64) # 50_240 parameters BatchNorm(64, relu) # 128 parameters, plus 128 non-trainable - Dense(64, 10) # 650 parameters + Dense(64 => 10) # 650 parameters julia> L2(m) = sum(sum(abs2, l.weight) for l in Flux.modules(m) if l isa Dense) L2 (generic function with 1 method) diff --git a/test/layers/show.jl b/test/layers/show.jl index c551bad978..9fe74028a6 100644 --- a/test/layers/show.jl +++ b/test/layers/show.jl @@ -1,67 +1,67 @@ @testset "layer printing" begin # 2-arg show, defined with layes - @test repr(Dense(2,3)) == "Dense(2, 3)" - @test repr(Chain(Dense(2,3))) == "Chain(Dense(2, 3))" - @test repr(Chain(lay=Dense(2,3))) == "Chain(lay = Dense(2, 3))" + @test repr(Dense(2,3)) == "Dense(2 => 3)" + @test repr(Chain(Dense(2,3))) == "Chain(Dense(2 => 3))" + @test repr(Chain(lay=Dense(2,3))) == "Chain(lay = Dense(2 => 3))" end @testset "nested model printing" begin # 3-arg show, defined in show.jl - # Dense -- has parameter count, but not inside a matrix + # Dense -- has parameter count, but not when inside a matrix: toplevel_dense = repr("text/plain", Dense(2,3)) - @test occursin("Dense(2, 3)", toplevel_dense) + @test occursin("Dense(2 => 3)", toplevel_dense) @test occursin("# 9 parameters", toplevel_dense) @test Meta.isexpr(Meta.parse(toplevel_dense), :call) # comment is ignored vector_dense = repr("text/plain", [Dense(2,3), Dense(2,3)]) - @test occursin("Dense(2, 3)", vector_dense) + @test occursin("Dense(2 => 3)", vector_dense) @test occursin("# 9 parameters", vector_dense) matrix_dense = repr("text/plain", fill(Dense(2,3), 3, 3)) - @test occursin("Dense(2, 3)", matrix_dense) + @test occursin("Dense(2 => 3)", matrix_dense) @test !occursin("# 9 parameters", matrix_dense) tuple_dense = repr("text/plain", tuple(Dense(2,3))) - @test occursin("Dense(2, 3)", tuple_dense) + @test occursin("Dense(2 => 3)", tuple_dense) @test !occursin("# 9 parameters", tuple_dense) # Chain -- gets split over lines at top level only toplevel_chain = repr("text/plain", Chain(Dense(2,3))) - @test occursin("Chain(\n Dense(2, 3)", toplevel_chain) + @test occursin("Chain(\n Dense(2 => 3)", toplevel_chain) @test occursin("# 9 parameters", toplevel_chain) @test !occursin("# Total:", toplevel_chain) vector_chain = repr("text/plain", [Chain(Dense(2,3)), Chain(Dense(2,3))]) - @test occursin("Chain(Dense(2, 3))", vector_chain) + @test occursin("Chain(Dense(2 => 3))", vector_chain) @test occursin("# 9 parameters", vector_chain) @test !occursin("# Total:", vector_chain) matrix_chain = repr("text/plain", fill(Chain(Dense(2,3)), 3,3)) - @test occursin("Chain(Dense(2, 3))", matrix_chain) + @test occursin("Chain(Dense(2 => 3))", matrix_chain) @test !occursin("# 9 parameters", matrix_chain) @test !occursin("# Total:", matrix_chain) - # ... and only long enough chains get + # ... 
and only long enough chains get a total at the end: - longchain = Chain(Dense(2, 3), Dense(3, 4), Dense(4, 5), softmax) + longchain = Chain(Dense(2 => 3), Dense(3 => 4), Dense(4 => 5), softmax) toplevel_longchain = repr("text/plain", longchain) - @test occursin("Chain(\n Dense(2, 3)", toplevel_longchain) + @test occursin("Chain(\n Dense(2 => 3)", toplevel_longchain) @test occursin("# 9 parameters", toplevel_longchain) @test occursin("# Total: 6 arrays, 50 parameters", toplevel_longchain) vector_longchain = repr("text/plain", [longchain, longchain]) # pretty ugly in reality - @test occursin("Chain(Dense(2, 3)", vector_longchain) + @test occursin("Chain(Dense(2 => 3)", vector_longchain) @test occursin("# 50 parameters", vector_longchain) @test !occursin("# 9 parameters", vector_longchain) @test !occursin("# Total:", vector_longchain) matrix_longchain = repr("text/plain", fill(longchain, 3,3)) - @test occursin("Chain(Dense(2, 3)", matrix_longchain) + @test occursin("Chain(Dense(2 => 3)", matrix_longchain) @test !occursin("# 9 parameters", matrix_longchain) @test !occursin("# Total:", matrix_longchain)
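
The NEWS entry above promises that the old positional notation keeps working; the soft deprecations added in `src/deprecations.jl` are what make that true. A minimal usage sketch, assuming a Flux build that includes this patch, showing that the old notation still constructs the same layers (the deprecation methods simply forward to the new `Pair` constructors) and that both forms print with the `=>` channel notation:

```julia
# Usage sketch, assuming a Flux build that includes this patch.
using Flux

d_new = Dense(2 => 3, relu)   # new channel notation, matching Conv
d_old = Dense(2, 3, relu)     # old notation, forwarded by src/deprecations.jl

size(d_new.weight) == size(d_old.weight) == (3, 2)  # true: identical layer structure

repr(d_new)  # "Dense(2 => 3, relu)" -- both print with the new notation
repr(d_old)  # "Dense(2 => 3, relu)"

# The same forwarding exists for Bilinear and Embedding:
Flux.Bilinear(4, 5, 3)   # equivalent to Flux.Bilinear((4, 5) => 3)
Flux.Embedding(10, 4)    # equivalent to Flux.Embedding(10 => 4)
```

Because the old methods only forward their arguments, existing code keeps working unchanged; only the printed form differs.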
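
The `src/layers/show.jl` hunk also adds `Bilinear` and `Embedding` to the layer types that receive the multi-line `text/plain` show with parameter counts. A rough sketch of the expected REPL display under this patch (parameter counts are just the weight sizes plus any bias):

```julia
julia> using Flux

julia> Flux.Embedding(1000 => 4)
Embedding(1000 => 4)  # 4_000 parameters

julia> Flux.Bilinear((9, 5) => 3, tanh; bias=false)
Bilinear((9, 5) => 3, tanh; bias=false)  # 135 parameters

julia> Flux.Bilinear(5 => 3)  # shorthand when both input sizes match
Bilinear(5 => 3)  # 78 parameters
```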