diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 000000000..07fcf66f1 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,9 @@ +style = "sciml" +whitespace_in_kwargs = true +format_docstrings = true +always_for_in = true +join_lines_based_on_source = true +separate_kwargs_with_semicolon = true +always_use_return = true +margin = 92 +indent = 4 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..ae9c21381 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,4 @@ +# .git-blame-ignore-revs +# Switched to SciML style for code +d5d28f0ef6e1e253ecf3fdbbec2f511836c8767b +70d639de532b046980cbea8d17fb1829e04cccfe diff --git a/Project.toml b/Project.toml index c010c513d..aeb660d6a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Metalhead" uuid = "dbeba491-748d-5e0e-a39e-b530a07fa0cc" -version = "0.7.1" +version = "0.7.2-DEV" [deps] Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" @@ -16,7 +16,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" BSON = "0.3.2" Flux = "0.13" Functors = "0.2" -MLUtils = "0.2" +MLUtils = "0.2.6" NNlib = "0.7.34, 0.8" julia = "1.6" diff --git a/src/Metalhead.jl b/src/Metalhead.jl index a0fb3785a..e465b6981 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -37,22 +37,23 @@ include("vit-based/vit.jl") include("pretrain.jl") -export AlexNet, - VGG, VGG11, VGG13, VGG16, VGG19, - ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, - GoogLeNet, Inception3, SqueezeNet, - DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, - ResNeXt, - MobileNetv1, MobileNetv2, MobileNetv3, - MLPMixer, ResMLP, gMLP, - ViT, - ConvNeXt, ConvMixer +export AlexNet, + VGG, VGG11, VGG13, VGG16, VGG19, + ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, + GoogLeNet, Inception3, SqueezeNet, + DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, + ResNeXt, + MobileNetv1, MobileNetv2, MobileNetv3, + MLPMixer, ResMLP, gMLP, + ViT, + ConvNeXt, ConvMixer # use Flux._big_show to pretty print large models -for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, :ResNeXt, +for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, + :ResNeXt, :MobileNetv1, :MobileNetv2, :MobileNetv3, :MLPMixer, :ResMLP, :gMLP, :ViT, :ConvNeXt, :ConvMixer) - @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) + @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) end end # module diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index ea3962c2a..405272dd2 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -5,26 +5,27 @@ Create an AlexNet model ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)). 
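# Editor's note: an illustrative usage sketch, not part of this diff. It shows how the
# exported AlexNet wrapper touched below is typically constructed and applied; the
# 224x224 WHCN input shape and the callable forward pass are assumptions based on the
# rest of the package rather than on this hunk.
using Metalhead, Flux
model = AlexNet(; nclasses = 10)            # random weights; pretrained weights are unsupported
y = model(rand(Float32, 224, 224, 3, 1))    # 10x1 matrix of class logits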
# Arguments -- `nclasses`: the number of output classes + + - `nclasses`: the number of output classes """ function alexnet(; nclasses = 1000) - layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((5, 5), 64 => 192, relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((3, 3), 192 => 384, relu, pad = (1, 1)), - Conv((3, 3), 384 => 256, relu, pad = (1, 1)), - Conv((3, 3), 256 => 256, relu, pad = (1, 1)), - MaxPool((3, 3), stride = (2, 2)), - AdaptiveMeanPool((6,6))), - Chain(MLUtils.flatten, - Dropout(0.5), - Dense(256 * 6 * 6, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dense(4096, nclasses))) - - return layers + layers = Chain(Chain(Conv((11, 11), 3 => 64, relu; stride = (4, 4), pad = (2, 2)), + MaxPool((3, 3); stride = (2, 2)), + Conv((5, 5), 64 => 192, relu; pad = (2, 2)), + MaxPool((3, 3); stride = (2, 2)), + Conv((3, 3), 192 => 384, relu; pad = (1, 1)), + Conv((3, 3), 384 => 256, relu; pad = (1, 1)), + Conv((3, 3), 256 => 256, relu; pad = (1, 1)), + MaxPool((3, 3); stride = (2, 2)), + AdaptiveMeanPool((6, 6))), + Chain(MLUtils.flatten, + Dropout(0.5), + Dense(256 * 6 * 6, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dense(4096, nclasses))) + + return layers end """ @@ -34,21 +35,22 @@ Create a `AlexNet`. See also [`alexnet`](#). !!! warning + `AlexNet` does not currently support pretrained weights. # Arguments -- `pretrain`: set to `true` to load pre-trained weights for ImageNet -- `nclasses`: the number of output classes + + - `pretrain`: set to `true` to load pre-trained weights for ImageNet + - `nclasses`: the number of output classes """ struct AlexNet - layers + layers::Any end function AlexNet(; pretrain = false, nclasses = 1000) - layers = alexnet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "AlexNet") - - AlexNet(layers) + layers = alexnet(; nclasses = nclasses) + pretrain && loadpretrain!(layers, "AlexNet") + return AlexNet(layers) end @functor AlexNet diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index 01a6e61be..e19acb2e7 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -6,30 +6,35 @@ Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) # Arguments -- `planes`: number of planes in the output of each block -- `depth`: number of layers -- `inchannels`: number of channels in the input -- `kernel_size`: kernel size of the convolutional layers -- `patch_size`: size of the patches -- `activation`: activation function used after the convolutional layers -- `nclasses`: number of classes in the output + + - `planes`: number of planes in the output of each block + - `depth`: number of layers + - `inchannels`: number of channels in the input + - `kernel_size`: kernel size of the convolutional layers + - `patch_size`: size of the patches + - `activation`: activation function used after the convolutional layers + - `nclasses`: number of classes in the output """ function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000) - stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1]) - blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; - preact = true, groups = planes, pad = SamePad())), +), - conv_bn((1, 1), planes, planes, activation; preact = true)...) 
for _ in 1:depth] - head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) - return Chain(Chain(stem..., Chain(blocks)), head) + stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, + stride = patch_size[1]) + blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; + preact = true, groups = planes, + pad = SamePad())), +), + conv_bn((1, 1), planes, planes, activation; preact = true)...) + for _ in 1:depth] + head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) + return Chain(Chain(stem..., Chain(blocks)), head) end convmixer_config = Dict(:base => Dict(:planes => 1536, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7)), + :patch_size => (7, 7)), :small => Dict(:planes => 768, :depth => 32, :kernel_size => (7, 7), - :patch_size => (7, 7)), - :large => Dict(:planes => 1024, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7))) + :patch_size => (7, 7)), + :large => Dict(:planes => 1024, :depth => 20, + :kernel_size => (9, 9), + :patch_size => (7, 7))) """ ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) @@ -38,22 +43,24 @@ Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) # Arguments -- `mode`: the mode of the model, either `:base`, `:small` or `:large` -- `inchannels`: number of channels in the input -- `activation`: activation function used after the convolutional layers -- `nclasses`: number of classes in the output + + - `mode`: the mode of the model, either `:base`, `:small` or `:large` + - `inchannels`: number of channels in the input + - `activation`: activation function used after the convolutional layers + - `nclasses`: number of classes in the output """ struct ConvMixer - layers + layers::Any end function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) - planes = convmixer_config[mode][:planes] - depth = convmixer_config[mode][:depth] - kernel_size = convmixer_config[mode][:kernel_size] - patch_size = convmixer_config[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, nclasses) - return ConvMixer(layers) + planes = convmixer_config[mode][:planes] + depth = convmixer_config[mode][:depth] + kernel_size = convmixer_config[mode][:kernel_size] + patch_size = convmixer_config[mode][:patch_size] + layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, + nclasses) + return ConvMixer(layers) end @functor ConvMixer diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 1621803bf..31e2d3ac0 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -5,19 +5,20 @@ Creates a single block of ConvNeXt. ([reference](https://arxiv.org/abs/2201.03545)) # Arguments: -- `planes`: number of input channels. -- `drop_path_rate`: Stochastic depth rate. -- `λ`: Init value for LayerScale + + - `planes`: number of input channels. + - `drop_path_rate`: Stochastic depth rate. 
+ - `λ`: Init value for LayerScale """ -function convnextblock(planes, drop_path_rate = 0., λ = 1f-6) - layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), - swapdims((3, 1, 2, 4)), - LayerNorm(planes; ϵ = 1f-6), - mlp_block(planes, 4 * planes), - LayerScale(planes, λ), - swapdims((2, 3, 1, 4)), - DropPath(drop_path_rate)), +) - return layers +function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6) + layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), + swapdims((3, 1, 2, 4)), + LayerNorm(planes; ϵ = 1.0f-6), + mlp_block(planes, 4 * planes), + LayerScale(planes, λ), + swapdims((2, 3, 1, 4)), + DropPath(drop_path_rate)), +) + return layers end """ @@ -27,52 +28,59 @@ Creates the layers for a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) # Arguments: -- `inchannels`: number of input channels. -- `depths`: list with configuration for depth of each block -- `planes`: list with configuration for number of output channels in each block -- `drop_path_rate`: Stochastic depth rate. -- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) -- `nclasses`: number of output classes + + - `inchannels`: number of input channels. + - `depths`: list with configuration for depth of each block + - `planes`: list with configuration for number of output channels in each block + - `drop_path_rate`: Stochastic depth rate. + - `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) + - `nclasses`: number of output classes """ -function convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) - @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" - - downsample_layers = [] - stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), - ChannelLayerNorm(planes[1]; ϵ = 1f-6)) - push!(downsample_layers, stem) - for m in 1:length(depths) - 1 - downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1f-6), - Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) - push!(downsample_layers, downsample_layer) - end - - stages = [] - dp_rates = LinRange{Float32}(0., drop_path_rate, sum(depths)) - cur = 0 - for i in 1:length(depths) - push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) - cur += depths[i] - end - - backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) - head = Chain(GlobalMeanPool(), - MLUtils.flatten, - LayerNorm(planes[end]), - Dense(planes[end], nclasses)) - - return Chain(Chain(backbone), head) +function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, + nclasses = 1000) + @assert length(depths)==length(planes) "`planes` should have exactly one value for each block" + + downsample_layers = [] + stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), + ChannelLayerNorm(planes[1]; ϵ = 1.0f-6)) + push!(downsample_layers, stem) + for m in 1:(length(depths) - 1) + downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1.0f-6), + Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) + push!(downsample_layers, downsample_layer) + end + + stages = [] + dp_rates = LinRange{Float32}(0.0, drop_path_rate, sum(depths)) + cur = 0 + for i in 1:length(depths) + push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) + cur += depths[i] + end + + backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) + head = Chain(GlobalMeanPool(), + MLUtils.flatten, + 
LayerNorm(planes[end]), + Dense(planes[end], nclasses)) + + return Chain(Chain(backbone), head) end # Configurations for ConvNeXt models -convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], :planes => [96, 192, 384, 768]), - :small => Dict(:depths => [3, 3, 27, 3], :planes => [96, 192, 384, 768]), - :base => Dict(:depths => [3, 3, 27, 3], :planes => [128, 256, 512, 1024]), - :large => Dict(:depths => [3, 3, 27, 3], :planes => [192, 384, 768, 1536]), - :xlarge => Dict(:depths => [3, 3, 27, 3], :planes => [256, 512, 1024, 2048])) +convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], + :planes => [96, 192, 384, 768]), + :small => Dict(:depths => [3, 3, 27, 3], + :planes => [96, 192, 384, 768]), + :base => Dict(:depths => [3, 3, 27, 3], + :planes => [128, 256, 512, 1024]), + :large => Dict(:depths => [3, 3, 27, 3], + :planes => [192, 384, 768, 1536]), + :xlarge => Dict(:depths => [3, 3, 27, 3], + :planes => [256, 512, 1024, 2048])) struct ConvNeXt - layers + layers::Any end """ @@ -82,20 +90,21 @@ Creates a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) # Arguments: -- `inchannels`: number of input channels. -- `drop_path_rate`: Stochastic depth rate. -- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) -- `nclasses`: number of output classes + + - `inchannels`: number of input channels. + - `drop_path_rate`: Stochastic depth rate. + - `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) + - `nclasses`: number of output classes See also [`Metalhead.convnext`](#). """ -function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, +function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, nclasses = 1000) - @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" - depths = convnext_configs[mode][:depths] - planes = convnext_configs[mode][:planes] - layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) - return ConvNeXt(layers) + @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" + depths = convnext_configs[mode][:depths] + planes = convnext_configs[mode][:planes] + layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) + return ConvNeXt(layers) end (m::ConvNeXt)(x) = m.layers(x) diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index bda7a321d..5384161fd 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -5,16 +5,18 @@ Create a Densenet bottleneck layer ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `inplanes`: number of input feature maps -- `outplanes`: number of output feature maps on bottleneck branch - (and scaling factor for inner feature maps; see ref) + + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps on bottleneck branch + (and scaling factor for inner feature maps; see ref) """ function dense_bottleneck(inplanes, outplanes) - inner_channels = 4 * outplanes - m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., - conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, rev = true)...) + inner_channels = 4 * outplanes + m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., + conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, + rev = true)...) 
- SkipConnection(m, cat_channels) + return SkipConnection(m, cat_channels) end """ @@ -24,11 +26,14 @@ Create a DenseNet transition sequence ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `inplanes`: number of input feature maps -- `outplanes`: number of output feature maps + + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps """ -transition(inplanes, outplanes) = - Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., MeanPool((2, 2))) +function transition(inplanes, outplanes) + return Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., + MeanPool((2, 2))) +end """ dense_block(inplanes, growth_rates) @@ -38,12 +43,16 @@ the number of output feature maps by `growth_rates` with each block ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `inplanes`: number of input feature maps to the full sequence -- `growth_rates`: the growth (additive) rates of output feature maps - after each block (a vector of `k`s from the ref) + + - `inplanes`: number of input feature maps to the full sequence + - `growth_rates`: the growth (additive) rates of output feature maps + after each block (a vector of `k`s from the ref) """ -dense_block(inplanes, growth_rates) = [dense_bottleneck(i, o) - for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +function dense_block(inplanes, growth_rates) + return [dense_bottleneck(i, o) + for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), + growth_rates)] +end """ densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) @@ -52,31 +61,32 @@ Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `inplanes`: the number of input feature maps to the first dense block -- `growth_rates`: the growth rates of output feature maps within each - [`dense_block`](#) (a vector of vectors) -- `reduction`: the factor by which the number of feature maps is scaled across each transition -- `nclasses`: the number of output classes + + - `inplanes`: the number of input feature maps to the first dense block + - `growth_rates`: the growth rates of output feature maps within each + [`dense_block`](#) (a vector of vectors) + - `reduction`: the factor by which the number of feature maps is scaled across each transition + - `nclasses`: the number of output classes """ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) - push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) - - outplanes = 0 - for (i, rates) in enumerate(growth_rates) - outplanes = inplanes + sum(rates) - append!(layers, dense_block(inplanes, rates)) - (i != length(growth_rates)) && - push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) - inplanes = floor(Int, outplanes * reduction) - end - push!(layers, BatchNorm(outplanes, relu)) - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dense(outplanes, nclasses))) + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) + push!(layers, MaxPool((3, 3); stride = 2, pad = (1, 1))) + + outplanes = 0 + for (i, rates) in enumerate(growth_rates) + outplanes = inplanes + sum(rates) + append!(layers, dense_block(inplanes, rates)) + (i != length(growth_rates)) && + push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) + inplanes = floor(Int, 
outplanes * reduction) + end + push!(layers, BatchNorm(outplanes, relu)) + + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dense(outplanes, nclasses))) end """ @@ -86,14 +96,16 @@ Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `nblocks`: number of dense blocks between transitions -- `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the ref) -- `reduction`: the factor by which the number of feature maps is scaled across each transition -- `nclasses`: the number of output classes + + - `nblocks`: number of dense blocks between transitions + - `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the ref) + - `reduction`: the factor by which the number of feature maps is scaled across each transition + - `nclasses`: the number of output classes """ -densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) = - densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; - reduction = reduction, nclasses = nclasses) +function densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) + return densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; + reduction = reduction, nclasses = nclasses) +end """ DenseNet(nblocks::NTuple{N, <:Integer}; @@ -104,22 +116,23 @@ Create a DenseNet model See also [`densenet`](#). # Arguments -- `nblocks`: number of dense blocks between transitions -- `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the paper) -- `reduction`: the factor by which the number of feature maps is scaled across each transition -- `nclasses`: the number of output classes + + - `nblocks`: number of dense blocks between transitions + - `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the paper) + - `reduction`: the factor by which the number of feature maps is scaled across each transition + - `nclasses`: the number of output classes """ struct DenseNet - layers + layers::Any end function DenseNet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N} - layers = densenet(nblocks; growth_rate = growth_rate, - reduction = reduction, - nclasses = nclasses) + layers = densenet(nblocks; growth_rate = growth_rate, + reduction = reduction, + nclasses = nclasses) - DenseNet(layers) + return DenseNet(layers) end @functor DenseNet @@ -143,16 +156,17 @@ Create a DenseNet model with specified configuration. Currently supported values Set `pretrain = true` to load the model with pre-trained weights for ImageNet. !!! warning + `DenseNet` does not currently support pretrained weights. See also [`Metalhead.densenet`](#). """ function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) - @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." - model = DenseNet(densenet_config[config]; nclasses = nclasses) + @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." 
+ model = DenseNet(densenet_config[config]; nclasses = nclasses) - pretrain && loadpretrain!(model, string("DenseNet", config)) - return model + pretrain && loadpretrain!(model, string("DenseNet", config)) + return model end # deprecations diff --git a/src/convnets/googlenet.jl b/src/convnets/googlenet.jl index bc42a052f..318463494 100644 --- a/src/convnets/googlenet.jl +++ b/src/convnets/googlenet.jl @@ -5,13 +5,14 @@ Create an inception module for use in GoogLeNet ([reference](https://arxiv.org/abs/1409.4842v1)). # Arguments -- `inplanes`: the number of input feature maps -- `out_1x1`: the number of output feature maps for the 1x1 convolution (branch 1) -- `red_3x3`: the number of output feature maps for the 3x3 reduction convolution (branch 2) -- `out_3x3`: the number of output feature maps for the 3x3 convolution (branch 2) -- `red_5x5`: the number of output feature maps for the 5x5 reduction convolution (branch 3) -- `out_5x5`: the number of output feature maps for the 5x5 convolution (branch 3) -- `pool_proj`: the number of output feature maps for the pooling projection (branch 4) + + - `inplanes`: the number of input feature maps + - `out_1x1`: the number of output feature maps for the 1x1 convolution (branch 1) + - `red_3x3`: the number of output feature maps for the 3x3 reduction convolution (branch 2) + - `out_3x3`: the number of output feature maps for the 3x3 convolution (branch 2) + - `red_5x5`: the number of output feature maps for the 5x5 reduction convolution (branch 3) + - `out_5x5`: the number of output feature maps for the 5x5 convolution (branch 3) + - `pool_proj`: the number of output feature maps for the pooling projection (branch 4) """ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj) branch1 = Chain(Conv((1, 1), inplanes => out_1x1)) @@ -20,9 +21,8 @@ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, Conv((3, 3), red_3x3 => out_3x3; pad = 1)) branch3 = Chain(Conv((1, 1), inplanes => red_5x5), - Conv((5, 5), red_5x5 => out_5x5; pad = 2)) - - branch4 = Chain(MaxPool((3, 3), stride=1, pad = 1), + Conv((5, 5), red_5x5 => out_5x5; pad = 2)) + branch4 = Chain(MaxPool((3, 3); stride = 1, pad = 1), Conv((1, 1), inplanes => pool_proj)) return Parallel(cat_channels, @@ -36,31 +36,31 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) ([reference](https://arxiv.org/abs/1409.4842v1)). 
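# Editor's note: a hedged usage sketch, not part of this diff. The lower-case
# `googlenet` builder documented here returns a plain Flux Chain; the 224x224 input
# size and the unexported `Metalhead.googlenet` qualification are assumptions for illustration.
using Metalhead, Flux
chain = Metalhead.googlenet(; nclasses = 100)
size(chain(rand(Float32, 224, 224, 3, 2)))  # expected to be (100, 2)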
# Arguments -- `nclasses`: the number of output classes + + - `nclasses`: the number of output classes """ function googlenet(; nclasses = 1000) - layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), - MaxPool((3, 3), stride = 2, pad = 1), - Conv((1, 1), 64 => 64), - Conv((3, 3), 64 => 192; pad = 1), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(192, 64, 96, 128, 16, 32, 32), - _inceptionblock(256, 128, 128, 192, 32, 96, 64), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(480, 192, 96, 208, 16, 48, 64), - _inceptionblock(512, 160, 112, 224, 24, 64, 64), - _inceptionblock(512, 128, 128, 256, 24, 64, 64), - _inceptionblock(512, 112, 144, 288, 32, 64, 64), - _inceptionblock(528, 256, 160, 320, 32, 128, 128), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(832, 256, 160, 320, 32, 128, 128), - _inceptionblock(832, 384, 192, 384, 48, 128, 128)), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dropout(0.4), - Dense(1024, nclasses))) - - return layers + layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), + MaxPool((3, 3); stride = 2, pad = 1), + Conv((1, 1), 64 => 64), + Conv((3, 3), 64 => 192; pad = 1), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(192, 64, 96, 128, 16, 32, 32), + _inceptionblock(256, 128, 128, 192, 32, 96, 64), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(480, 192, 96, 208, 16, 48, 64), + _inceptionblock(512, 160, 112, 224, 24, 64, 64), + _inceptionblock(512, 128, 128, 256, 24, 64, 64), + _inceptionblock(512, 112, 144, 288, 32, 64, 64), + _inceptionblock(528, 256, 160, 320, 32, 128, 128), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(832, 256, 160, 320, 32, 128, 128), + _inceptionblock(832, 384, 192, 384, 48, 128, 128)), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dropout(0.4), + Dense(1024, nclasses))) + return layers end """ @@ -70,23 +70,24 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`) ([reference](https://arxiv.org/abs/1409.4842v1)). # Arguments -- `pretrain`: set to `true` to load the model with pre-trained weights for ImageNet -- `nclasses`: the number of output classes + + - `pretrain`: set to `true` to load the model with pre-trained weights for ImageNet + - `nclasses`: the number of output classes !!! warning + `GoogLeNet` does not currently support pretrained weights. See also [`googlenet`](#). """ struct GoogLeNet - layers + layers::Any end function GoogLeNet(; pretrain = false, nclasses = 1000) - layers = googlenet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "GoogLeNet") - - GoogLeNet(layers) + layers = googlenet(; nclasses = nclasses) + pretrain && loadpretrain!(layers, "GoogLeNet") + return GoogLeNet(layers) end @functor GoogLeNet diff --git a/src/convnets/inception.jl b/src/convnets/inception.jl index ef8ab81ef..e81644599 100644 --- a/src/convnets/inception.jl +++ b/src/convnets/inception.jl @@ -5,24 +5,21 @@ Create an Inception-v3 style-A module (ref: Fig. 5 in [paper](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `inplanes`: number of input feature maps -- `pool_proj`: the number of output feature maps for the pooling projection + + - `inplanes`: number of input feature maps + - `pool_proj`: the number of output feature maps for the pooling projection """ function inception_a(inplanes, pool_proj) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) - - branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., - conv_bn((5, 5), 48, 64; pad = 2)...) 
- - branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; pad = 1)...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, pool_proj)...) - - return Parallel(cat_channels, - branch1x1, branch5x5, branch3x3, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) + branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., + conv_bn((5, 5), 48, 64; pad = 2)...) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; pad = 1)...) + branch_pool = Chain(MeanPool((3, 3); pad = 1, stride = 1), + conv_bn((1, 1), inplanes, pool_proj)...) + return Parallel(cat_channels, + branch1x1, branch5x5, branch3x3, branch_pool) end """ @@ -32,19 +29,17 @@ Create an Inception-v3 style-B module (ref: Fig. 10 in [paper](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `inplanes`: number of input feature maps + + - `inplanes`: number of input feature maps """ function inception_b(inplanes) - branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride = 2) - - return Parallel(cat_channels, - branch3x3_1, branch3x3_2, branch_pool) + branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; stride = 2)...) + branch_pool = MaxPool((3, 3); stride = 2) + return Parallel(cat_channels, + branch3x3_1, branch3x3_2, branch_pool) end """ @@ -54,28 +49,25 @@ Create an Inception-v3 style-C module (ref: Fig. 6 in [paper](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `inplanes`: number of input feature maps -- `inner_planes`: the number of output feature maps within each branch -- `n`: the "grid size" (kernel size) for the convolution layers + + - `inplanes`: number of input feature maps + - `inner_planes`: the number of output feature maps within each branch + - `n`: the "grid size" (kernel size) for the convolution layers """ function inception_c(inplanes, inner_planes, n = 7) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) - - branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) - - branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride=1), - conv_bn((1, 1), inplanes, 192)...) - - return Parallel(cat_channels, - branch1x1, branch7x7_1, branch7x7_2, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) + branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) + branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) 
+ branch_pool = Chain(MeanPool((3, 3); pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) + return Parallel(cat_channels, + branch1x1, branch7x7_1, branch7x7_2, branch_pool) end """ @@ -85,21 +77,19 @@ Create an Inception-v3 style-D module (ref: [pytorch](https://github.com/pytorch/vision/blob/6db1569c89094cf23f3bc41f79275c45e9fcb3f3/torchvision/models/inception.py#L322)). # Arguments -- `inplanes`: number of input feature maps + + - `inplanes`: number of input feature maps """ function inception_d(inplanes) - branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((3, 3), 192, 320; stride = 2)...) - - branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((1, 7), 192, 192; pad = (0, 3))..., - conv_bn((7, 1), 192, 192; pad = (3, 0))..., - conv_bn((3, 3), 192, 192; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride=2) - - return Parallel(cat_channels, - branch3x3, branch7x7x3, branch_pool) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((3, 3), 192, 320; stride = 2)...) + branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((1, 7), 192, 192; pad = (0, 3))..., + conv_bn((7, 1), 192, 192; pad = (3, 0))..., + conv_bn((3, 3), 192, 192; stride = 2)...) + branch_pool = MaxPool((3, 3); stride = 2) + return Parallel(cat_channels, + branch3x3, branch7x7x3, branch_pool) end """ @@ -109,33 +99,29 @@ Create an Inception-v3 style-E module (ref: Fig. 7 in [paper](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `inplanes`: number of input feature maps + + - `inplanes`: number of input feature maps """ function inception_e(inplanes) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) - - branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) - branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., - conv_bn((3, 3), 448, 384; pad = 1)...) - branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, 192)...) - - return Parallel(cat_channels, - branch1x1, - Chain(branch3x3_1, - Parallel(cat_channels, - branch3x3_1a, branch3x3_1b)), - - Chain(branch3x3_2, - Parallel(cat_channels, - branch3x3_2a, branch3x3_2b)), - branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) + branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) + branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., + conv_bn((3, 3), 448, 384; pad = 1)...) + branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch_pool = Chain(MeanPool((3, 3); pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) + return Parallel(cat_channels, + branch1x1, + Chain(branch3x3_1, + Parallel(cat_channels, + branch3x3_1a, branch3x3_1b)), + Chain(branch3x3_2, + Parallel(cat_channels, + branch3x3_2a, branch3x3_2b)), + branch_pool) end """ @@ -144,36 +130,37 @@ end Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `nclasses`: the number of output classes + + - `nclasses`: the number of output classes !!! warning + `inception3` does not currently support pretrained weights. 
""" function inception3(; nclasses = 1000) - layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., - conv_bn((3, 3), 32, 32)..., - conv_bn((3, 3), 32, 64; pad = 1)..., - MaxPool((3, 3), stride = 2), - conv_bn((1, 1), 64, 80)..., - conv_bn((3, 3), 80, 192)..., - MaxPool((3, 3), stride = 2), - inception_a(192, 32), - inception_a(256, 64), - inception_a(288, 64), - inception_b(288), - inception_c(768, 128), - inception_c(768, 160), - inception_c(768, 160), - inception_c(768, 192), - inception_d(768), - inception_e(1280), - inception_e(2048)), - Chain(AdaptiveMeanPool((1, 1)), - Dropout(0.2), - MLUtils.flatten, - Dense(2048, nclasses))) - - return layer + layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., + conv_bn((3, 3), 32, 32)..., + conv_bn((3, 3), 32, 64; pad = 1)..., + MaxPool((3, 3); stride = 2), + conv_bn((1, 1), 64, 80)..., + conv_bn((3, 3), 80, 192)..., + MaxPool((3, 3); stride = 2), + inception_a(192, 32), + inception_a(256, 64), + inception_a(288, 64), + inception_b(288), + inception_c(768, 128), + inception_c(768, 160), + inception_c(768, 160), + inception_c(768, 192), + inception_d(768), + inception_e(1280), + inception_e(2048)), + Chain(AdaptiveMeanPool((1, 1)), + Dropout(0.2), + MLUtils.flatten, + Dense(2048, nclasses))) + return layer end """ @@ -183,21 +170,22 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). See also [`inception3`](#). # Arguments -- `pretrain`: set to `true` to load the pre-trained weights for ImageNet -- `nclasses`: the number of output classes + + - `pretrain`: set to `true` to load the pre-trained weights for ImageNet + - `nclasses`: the number of output classes !!! warning + `Inception3` does not currently support pretrained weights. """ struct Inception3 - layers + layers::Any end function Inception3(; pretrain = false, nclasses = 1000) - layers = inception3(nclasses = nclasses) - pretrain && loadpretrain!(layers, "Inception3") - - Inception3(layers) + layers = inception3(; nclasses = nclasses) + pretrain && loadpretrain!(layers, "Inception3") + return Inception3(layers) end @functor Inception3 diff --git a/src/convnets/mobilenet.jl b/src/convnets/mobilenet.jl index 2dfd06f8d..f2f85a383 100644 --- a/src/convnets/mobilenet.jl +++ b/src/convnets/mobilenet.jl @@ -10,81 +10,86 @@ Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). 
# Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper) -- `configs`: A "list of tuples" configuration for each layer that details: - - `dw`: Set true to use a depthwise separable convolution or false for regular convolution - - `o`: The number of output feature maps - - `s`: The stride of the convolutional kernel - - `r`: The number of time this configuration block is repeated -- `activate`: The activation function to use throughout the network -- `inchannels`: The number of input feature maps`` -- `fcsize`: The intermediate fully-connected size between the convolution and final layers -- `nclasses`: The number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper) + + - `configs`: A "list of tuples" configuration for each layer that details: + + + `dw`: Set true to use a depthwise separable convolution or false for regular convolution + + `o`: The number of output feature maps + + `s`: The stride of the convolutional kernel + + `r`: The number of time this configuration block is repeated + - `activate`: The activation function to use throughout the network + - `inchannels`: The number of input feature maps`` + - `fcsize`: The intermediate fully-connected size between the convolution and final layers + - `nclasses`: The number of output classes """ function mobilenetv1(width_mult, config; activation = relu, inchannels = 3, nclasses = 1000, fcsize = 1024) - layers = [] - for (dw, outch, stride, nrepeats) in config - outch = Int(outch * width_mult) - for _ in 1:nrepeats - layer = dw ? depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; - stride = stride, pad = 1) : - conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) - append!(layers, layer) - inchannels = outch + layers = [] + for (dw, outch, stride, nrepeats) in config + outch = Int(outch * width_mult) + for _ in 1:nrepeats + layer = dw ? + depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; + stride = stride, pad = 1) : + conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) + append!(layers, layer) + inchannels = outch + end end - end - return Chain(Chain(layers), - Chain(GlobalMeanPool(), - MLUtils.flatten, - Dense(inchannels, fcsize, activation), - Dense(fcsize, nclasses))) + return Chain(Chain(layers), + Chain(GlobalMeanPool(), + MLUtils.flatten, + Dense(inchannels, fcsize, activation), + Dense(fcsize, nclasses))) end const mobilenetv1_configs = [ -# dw, c, s, r - (false, 32, 2, 1), - ( true, 64, 1, 1), - ( true, 128, 2, 1), - ( true, 128, 1, 1), - ( true, 256, 2, 1), - ( true, 256, 1, 1), - ( true, 512, 2, 1), - ( true, 512, 1, 5), - ( true, 1024, 2, 1), - ( true, 1024, 1, 1) + # dw, c, s, r + (false, 32, 2, 1), + (true, 64, 1, 1), + (true, 128, 2, 1), + (true, 128, 1, 1), + (true, 256, 2, 1), + (true, 256, 1, 1), + (true, 512, 2, 1), + (true, 512, 1, 5), + (true, 1024, 2, 1), + (true, 1024, 1, 1), ] """ MobileNetv1(width_mult = 1; pretrain = false, nclasses = 1000) -Create a MobileNetv1 model with the baseline configuration +Create a MobileNetv1 model with the baseline configuration ([reference](https://arxiv.org/abs/1704.04861v1)). Set `pretrain` to `true` to load the pretrained weights for ImageNet. 
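# Editor's note: an illustrative sketch, not part of this diff. It constructs the
# exported MobileNetv1 wrapper with a reduced width multiplier; whether ImageNet
# weights are actually published for this model is not asserted here.
using Metalhead
m = MobileNetv1(0.75; nclasses = 1000)      # random weights, width multiplier 0.75
# MobileNetv1(1; pretrain = true)           # would call loadpretrain!; errors if no artifact exists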
# Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper; - this is usually a value between 0.1 and 1.4) -- `pretrain`: Whether to load the pre-trained weights for ImageNet -- `nclasses`: The number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper; + this is usually a value between 0.1 and 1.4) + - `pretrain`: Whether to load the pre-trained weights for ImageNet + - `nclasses`: The number of output classes See also [`Metalhead.mobilenetv1`](#). """ struct MobileNetv1 - layers + layers::Any end function MobileNetv1(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv1")) + layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv1")) - return MobileNetv1(layers) + return MobileNetv1(layers) end @functor MobileNetv1 @@ -103,56 +108,60 @@ Create a MobileNetv2 model. ([reference](https://arxiv.org/abs/1801.04381)). # Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper) -- `configs`: A "list of tuples" configuration for each layer that details: - - `t`: The expansion factor that controls the number of feature maps in the bottleneck layer - - `c`: The number of output feature maps - - `n`: The number of times a block is repeated - - `s`: The stride of the convolutional kernel - - `a`: The activation function used in the bottleneck layer -- `max_width`: The maximum number of feature maps in any layer of the network -- `nclasses`: The number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper) + + - `configs`: A "list of tuples" configuration for each layer that details: + + + `t`: The expansion factor that controls the number of feature maps in the bottleneck layer + + `c`: The number of output feature maps + + `n`: The number of times a block is repeated + + `s`: The stride of the convolutional kernel + + `a`: The activation function used in the bottleneck layer + - `max_width`: The maximum number of feature maps in any layer of the network + - `nclasses`: The number of output classes """ function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) - # building first layer - inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) - - # building inverted residual blocks - for (t, c, n, s, a) in configs - outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) - for i in 1:n - push!(layers, invertedresidual(3, inplanes, inplanes * t, outplanes, a; - stride = i == 1 ? s : 1)) - inplanes = outplanes + # building first layer + inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes; stride = 2)) + # building inverted residual blocks + for (t, c, n, s, a) in configs + outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) + for i in 1:n + push!(layers, + invertedresidual(3, inplanes, inplanes * t, outplanes, a; + stride = i == 1 ? s : 1)) + inplanes = outplanes + end end - end - - # building last several layers - outplanes = (width_mult > 1) ? 
_round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : - max_width - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(outplanes, nclasses))) + # building last several layers + outplanes = (width_mult > 1) ? + _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : + max_width + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, outplanes, relu6; bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(outplanes, nclasses))) end # Layer configurations for MobileNetv2 const mobilenetv2_configs = [ -# t, c, n, s, a - (1, 16, 1, 1, relu6), - (6, 24, 2, 2, relu6), - (6, 32, 3, 2, relu6), - (6, 64, 4, 2, relu6), - (6, 96, 3, 1, relu6), - (6, 160, 3, 2, relu6), - (6, 320, 1, 1, relu6) + # t, c, n, s, a + (1, 16, 1, 1, relu6), + (6, 24, 2, 2, relu6), + (6, 32, 3, 2, relu6), + (6, 64, 4, 2, relu6), + (6, 96, 3, 1, relu6), + (6, 160, 3, 2, relu6), + (6, 320, 1, 1, relu6), ] # Model definition for MobileNetv2 struct MobileNetv2 - layers + layers::Any end """ @@ -160,22 +169,22 @@ end Create a MobileNetv2 model with the specified configuration. ([reference](https://arxiv.org/abs/1801.04381)). -Set `pretrain` to `true` to load the pretrained weights for ImageNet. +Set `pretrain` to `true` to load the pretrained weights for ImageNet. # Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper; - this is usually a value between 0.1 and 1.4) -- `pretrain`: Whether to load the pre-trained weights for ImageNet -- `nclasses`: The number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper; + this is usually a value between 0.1 and 1.4) + - `pretrain`: Whether to load the pre-trained weights for ImageNet + - `nclasses`: The number of output classes See also [`Metalhead.mobilenetv2`](#). """ function MobileNetv2(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv2")) - - MobileNetv2(layers) + layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv2")) + return MobileNetv2(layers) end @functor MobileNetv2 @@ -194,85 +203,87 @@ Create a MobileNetv3 model. ([reference](https://arxiv.org/abs/1905.02244)). 
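# Editor's note: a hedged sketch, not part of this diff, showing a direct call to the
# low-level `mobilenetv3` builder with the unexported `mobilenetv3_configs` table that
# appears later in this file; treating that table as a public entry point is an assumption.
using Metalhead
chain = Metalhead.mobilenetv3(1.0, Metalhead.mobilenetv3_configs[:small]; nclasses = 10)
size(chain(rand(Float32, 224, 224, 3, 1)))  # expected to be (10, 1)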
# Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper; - this is usually a value between 0.1 and 1.4) -- `configs`: a "list of tuples" configuration for each layer that details: - - `k::Integer` - The size of the convolutional kernel - - `c::Float` - The multiplier factor for deciding the number of feature maps in the hidden layer - - `t::Integer` - The number of output feature maps for a given block - - `r::Integer` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers - - `s::Integer` - The stride of the convolutional kernel - - `a` - The activation function used in the bottleneck (typically `hardswish` or `relu`) -- `max_width`: The maximum number of feature maps in any layer of the network -- `nclasses`: the number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper; + this is usually a value between 0.1 and 1.4) + + - `configs`: a "list of tuples" configuration for each layer that details: + + + `k::Integer` - The size of the convolutional kernel + + `c::Float` - The multiplier factor for deciding the number of feature maps in the hidden layer + + `t::Integer` - The number of output feature maps for a given block + + `r::Integer` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers + + `s::Integer` - The stride of the convolutional kernel + + `a` - The activation function used in the bottleneck (typically `hardswish` or `relu`) + - `max_width`: The maximum number of feature maps in any layer of the network + - `nclasses`: the number of output classes """ function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) - # building first layer - inplanes = _round_channels(16 * width_mult, 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) - explanes = 0 - # building inverted residual blocks - for (k, t, c, r, a, s) in configs - # inverted residual layers - outplanes = _round_channels(c * width_mult, 8) - explanes = _round_channels(inplanes * t, 8) - push!(layers, invertedresidual(k, inplanes, explanes, outplanes, a; - stride = s, reduction = r)) - inplanes = outplanes - end - - # building last several layers - output_channel = max_width - output_channel = width_mult > 1.0 ? _round_channels(output_channel * width_mult, 8) : output_channel - classifier = Chain(Dense(explanes, output_channel, hardswish), - Dropout(0.2), - Dense(output_channel, nclasses)) - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) + # building first layer + inplanes = _round_channels(16 * width_mult, 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) + explanes = 0 + # building inverted residual blocks + for (k, t, c, r, a, s) in configs + # inverted residual layers + outplanes = _round_channels(c * width_mult, 8) + explanes = _round_channels(inplanes * t, 8) + push!(layers, + invertedresidual(k, inplanes, explanes, outplanes, a; + stride = s, reduction = r)) + inplanes = outplanes + end + # building last several layers + output_channel = max_width + output_channel = width_mult > 1.0 ? 
_round_channels(output_channel * width_mult, 8) : + output_channel + classifier = Chain(Dense(explanes, output_channel, hardswish), + Dropout(0.2), + Dense(output_channel, nclasses)) + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, explanes, hardswish; bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) end # Configurations for small and large mode for MobileNetv3 -mobilenetv3_configs = Dict( - :small => [ - # k, t, c, SE, a, s - (3, 1, 16, 4, relu, 2), - (3, 4.5, 24, nothing, relu, 2), - (3, 3.67, 24, nothing, relu, 1), - (5, 4, 40, 4, hardswish, 2), - (5, 6, 40, 4, hardswish, 1), - (5, 6, 40, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 2), - (5, 6, 96, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 1), - ], - :large => [ - # k, t, c, SE, a, s - (3, 1, 16, nothing, relu, 1), - (3, 4, 24, nothing, relu, 2), - (3, 3, 24, nothing, relu, 1), - (5, 3, 40, 4, relu, 2), - (5, 3, 40, 4, relu, 1), - (5, 3, 40, 4, relu, 1), - (3, 6, 80, nothing, hardswish, 2), - (3, 2.5, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 2), - (5, 6, 160, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 1) - ] -) +mobilenetv3_configs = Dict(:small => [ + # k, t, c, SE, a, s + (3, 1, 16, 4, relu, 2), + (3, 4.5, 24, nothing, relu, 2), + (3, 3.67, 24, nothing, relu, 1), + (5, 4, 40, 4, hardswish, 2), + (5, 6, 40, 4, hardswish, 1), + (5, 6, 40, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 2), + (5, 6, 96, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 1), + ], + :large => [ + # k, t, c, SE, a, s + (3, 1, 16, nothing, relu, 1), + (3, 4, 24, nothing, relu, 2), + (3, 3, 24, nothing, relu, 1), + (5, 3, 40, 4, relu, 2), + (5, 3, 40, 4, relu, 1), + (5, 3, 40, 4, relu, 1), + (3, 6, 80, nothing, hardswish, 2), + (3, 2.5, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 2), + (5, 6, 160, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 1), + ]) # Model definition for MobileNetv3 struct MobileNetv3 - layers + layers::Any end """ @@ -283,22 +294,24 @@ Create a MobileNetv3 model with the specified configuration. Set `pretrain = true` to load the model with pre-trained weights for ImageNet. # Arguments -- `mode`: :small or :large for the size of the model (see paper). -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper; - this is usually a value between 0.1 and 1.4) -- `pretrain`: whether to load the pre-trained weights for ImageNet -- `nclasses`: the number of output classes + + - `mode`: :small or :large for the size of the model (see paper). + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper; + this is usually a value between 0.1 and 1.4) + - `pretrain`: whether to load the pre-trained weights for ImageNet + - `nclasses`: the number of output classes See also [`Metalhead.mobilenetv3`](#). """ -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, nclasses = 1000) - @assert mode in [:large, :small] "`mode` has to be either :large or :small" - - max_width = (mode == :large) ? 
1280 : 1024 - layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) - MobileNetv3(layers) +function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, + nclasses = 1000) + @assert mode in [:large, :small] "`mode` has to be either :large or :small" + max_width = (mode == :large) ? 1280 : 1024 + layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, + nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) + return MobileNetv3(layers) end @functor MobileNetv3 diff --git a/src/convnets/resnet.jl b/src/convnets/resnet.jl index d91d65d6a..1a84bac68 100644 --- a/src/convnets/resnet.jl +++ b/src/convnets/resnet.jl @@ -5,15 +5,18 @@ Create a basic residual block ([reference](https://arxiv.org/abs/1512.03385v1)). # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: a list of the number of output feature maps for each convolution - within the residual block -- `downsample`: set to `true` to downsample the input + + - `inplanes`: the number of input feature maps + - `outplanes`: a list of the number of output feature maps for each convolution + within the residual block + - `downsample`: set to `true` to downsample the input """ function basicblock(inplanes, outplanes, downsample = false) - stride = downsample ? 2 : 1 - Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, bias = false)...) + stride = downsample ? 2 : 1 + return Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, + bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, + bias = false)...) end """ @@ -28,17 +31,21 @@ This version is standard across various ML frameworks. The original paper uses `stride == [2, 1, 1]` when `downsample == true` instead. # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: a list of the number of output feature maps for each convolution - within the residual block -- `downsample`: set to `true` to downsample the input -- `stride`: a list of the stride of the 3 convolutional layers + + - `inplanes`: the number of input feature maps + - `outplanes`: a list of the number of output feature maps for each convolution + within the residual block + - `downsample`: set to `true` to downsample the input + - `stride`: a list of the stride of the 3 convolutional layers """ function bottleneck(inplanes, outplanes, downsample = false; stride = [1, (downsample ? 2 : 1), 1]) - Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, bias = false)..., - conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], bias = false)...) + return Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], + bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, + bias = false)..., + conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], + bias = false)...) end """ @@ -50,13 +57,16 @@ Create a bottleneck residual block layer which has a stride of 2. 
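# Editor's note: an illustrative sketch, not part of this diff. The three-element
# `outplanes` list maps onto the 1x1 -> 3x3 -> 1x1 convolutions of the bottleneck;
# the shapes below assume the unexported helpers keep the signatures shown in this hunk.
using Metalhead
block = Metalhead.bottleneck_v1(64, [64, 64, 256], true)  # stride 2 on the first 1x1 conv
size(block(rand(Float32, 56, 56, 64, 1)))                 # expected to be (28, 28, 256, 1)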
# Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: a list of the number of output feature maps for each convolution - within the residual block -- `downsample`: set to `true` to downsample the input + + - `inplanes`: the number of input feature maps + - `outplanes`: a list of the number of output feature maps for each convolution + within the residual block + - `downsample`: set to `true` to downsample the input """ -bottleneck_v1(inplanes, outplanes, downsample = false) = - bottleneck(inplanes, outplanes, downsample; stride = [(downsample ? 2 : 1), 1, 1]) +function bottleneck_v1(inplanes, outplanes, downsample = false) + return bottleneck(inplanes, outplanes, downsample; + stride = [(downsample ? 2 : 1), 1, 1]) +end """ resnet(block, residuals::NTuple{2, Any}, connection = addrelu; @@ -66,43 +76,48 @@ Create a ResNet model ([reference](https://arxiv.org/abs/1512.03385v1)). # Arguments -- `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns - a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) -- `residuals`: a 2-tuple of functions with input `(inplanes, outplanes, downsample=false)`, - each of which will return a function that will be used as a new "skip" path to match a residual block. - [`Metalhead.skip_identity`](#) and [`Metalhead.skip_projection`](#) can be used here. -- `connection`: the binary function applied to the output of residual and skip paths in a block -- `channel_config`: the growth rate of the output feature maps within a residual block -- `block_config`: a list of the number of residual blocks at each stage -- `nclasses`: the number of output classes + + - `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns + a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) + - `residuals`: a 2-tuple of functions with input `(inplanes, outplanes, downsample=false)`, + each of which will return a function that will be used as a new "skip" path to match a residual block. + [`Metalhead.skip_identity`](#) and [`Metalhead.skip_projection`](#) can be used here. 
+ - `connection`: the binary function applied to the output of residual and skip paths in a block + - `channel_config`: the growth rate of the output feature maps within a residual block + - `block_config`: a list of the number of residual blocks at each stage + - `nclasses`: the number of output classes """ function resnet(block, residuals::AbstractVector{<:NTuple{2, Any}}, connection = addrelu; channel_config, block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 64 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes .* channel_config - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, block(inplanes, outplanes, i != 1), - residuals[i][1](inplanes, outplanes[end], i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes[end] - for _ in 2:nrepeats - push!(layers, Parallel(connection, block(inplanes, outplanes, false), - residuals[i][2](inplanes, outplanes[end], false))) - inplanes = outplanes[end] + inplanes = 64 + baseplanes = 64 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) + push!(layers, MaxPool((3, 3); stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes .* channel_config + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, block(inplanes, outplanes, i != 1), + residuals[i][1](inplanes, outplanes[end], i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes[end] + for _ in 2:nrepeats + push!(layers, + Parallel(connection, block(inplanes, outplanes, false), + residuals[i][2](inplanes, outplanes[end], false))) + inplanes = outplanes[end] + end + # next set of output plane base is doubled + baseplanes *= 2 end # next set of output plane base is doubled baseplanes *= 2 - end - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -113,45 +128,46 @@ Create a ResNet model ([reference](https://arxiv.org/abs/1512.03385v1)). 
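> Editor's note (not part of the patch): a quick sketch of the `bottleneck` block defined above. The channel layout follows the ResNet-50 stage-2 pattern; the 56×56 spatial size is illustrative only.

```julia
using Metalhead

# 1×1 → 3×3 → 1×1 convolutions, each followed by batch norm, as built by `conv_bn`.
block = Metalhead.bottleneck(64, [64, 64, 256])
x = rand(Float32, 56, 56, 64, 1)
size(block(x))  # (56, 56, 256, 1): spatial size preserved when downsample = false
```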
# Arguments -- `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns - a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) -- `shortcut_config`: the type of shortcut style (either `:A`, `:B`, or `:C`) - - `:A`: uses a [`Metalhead.skip_identity`](#) for all residual blocks - - `:B`: uses a [`Metalhead.skip_projection`](#) for the first residual block - and [`Metalhead.skip_identity`](@) for the remaining residual blocks - - `:C`: uses a [`Metalhead.skip_projection`](#) for all residual blocks -- `connection`: the binary function applied to the output of residual and skip paths in a block -- `channel_config`: the growth rate of the output feature maps within a residual block -- `block_config`: a list of the number of residual blocks at each stage -- `nclasses`: the number of output classes + + - `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns + a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) + + - `shortcut_config`: the type of shortcut style (either `:A`, `:B`, or `:C`) + + + `:A`: uses a [`Metalhead.skip_identity`](#) for all residual blocks + + `:B`: uses a [`Metalhead.skip_projection`](#) for the first residual block + and [`Metalhead.skip_identity`](@) for the remaining residual blocks + + `:C`: uses a [`Metalhead.skip_projection`](#) for all residual blocks + - `connection`: the binary function applied to the output of residual and skip paths in a block + - `channel_config`: the growth rate of the output feature maps within a residual block + - `block_config`: a list of the number of residual blocks at each stage + - `nclasses`: the number of output classes """ function resnet(block, shortcut_config::AbstractVector{<:Symbol}, args...; kwargs...) - shortcut_dict = Dict( - :A => (skip_identity, skip_identity), - :B => (skip_projection, skip_identity), - :C => (skip_projection, skip_projection)) - - if any(sc -> !haskey(shortcut_dict,sc),shortcut_config) - error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") - end - - shortcut = [shortcut_dict[sc] for sc in shortcut_config] - resnet(block, shortcut, args...; kwargs...) + shortcut_dict = Dict(:A => (skip_identity, skip_identity), + :B => (skip_projection, skip_identity), + :C => (skip_projection, skip_projection)) + if any(sc -> !haskey(shortcut_dict, sc), shortcut_config) + error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") + end + shortcut = [shortcut_dict[sc] for sc in shortcut_config] + return resnet(block, shortcut, args...; kwargs...) end function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs...) - resnet(block, fill(shortcut_config, length(block_config)), args...; - block_config = block_config, kwargs...) + return resnet(block, fill(shortcut_config, length(block_config)), args...; + block_config = block_config, kwargs...) end -resnet(block, residuals::NTuple{2}, args...; kwargs...) = resnet(block, [residuals], args...; kwargs...) +function resnet(block, residuals::NTuple{2}, args...; kwargs...) + return resnet(block, [residuals], args...; kwargs...) 
+end -const resnet_config = - Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), - 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), - 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), - 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), - 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) +const resnet_config = Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), + 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), + 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), + 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), + 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) """ ResNet(channel_config, block_config, shortcut_config; @@ -162,30 +178,30 @@ Create a `ResNet` model See also [`resnet`](#). # Arguments -- `channel_config`: the growth rate of the output feature maps within a residual block -- `block_config`: a list of the number of residual blocks at each stage -- `shortcut_config`: the type of shortcut style (either `:A`, `:B`, or `:C`). - `shortcut_config` can also be a vector of symbols if different shortcut styles are applied to - different residual blocks. -- `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns - a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) -- `connection`: the binary function applied to the output of residual and skip paths in a block -- `nclasses`: the number of output classes + + - `channel_config`: the growth rate of the output feature maps within a residual block + - `block_config`: a list of the number of residual blocks at each stage + - `shortcut_config`: the type of shortcut style (either `:A`, `:B`, or `:C`). + `shortcut_config` can also be a vector of symbols if different shortcut styles are applied to + different residual blocks. + - `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns + a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) + - `connection`: the binary function applied to the output of residual and skip paths in a block + - `nclasses`: the number of output classes """ struct ResNet - layers + layers::Any end function ResNet(channel_config, block_config, shortcut_config; block, connection = addrelu, nclasses = 1000) - layers = resnet(block, - shortcut_config, - connection; - channel_config = channel_config, - block_config = block_config, - nclasses = nclasses) - - ResNet(layers) + layers = resnet(block, + shortcut_config, + connection; + channel_config = channel_config, + block_config = block_config, + nclasses = nclasses) + return ResNet(layers) end @functor ResNet @@ -206,10 +222,12 @@ referred as ResNet v1.5. See also [`Metalhead.resnet`](#). # Arguments -- `depth`: depth of the ResNet model. Options include (18, 34, 50, 101, 152). -- `nclasses`: the number of output classes + + - `depth`: depth of the ResNet model. Options include (18, 34, 50, 101, 152). + - `nclasses`: the number of output classes !!! warning + Only `ResNet(50)` currently supports pretrained weights. 
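> Editor's note (not part of the patch): per the `resnet_config` table in this hunk, the depth-based constructor is sugar over the lower-level one, for example:

```julia
using Metalhead

# ResNet-50 spelled out via the low-level constructor...
m = ResNet([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]; block = Metalhead.bottleneck)

# ...which should be structurally identical (up to random initialisation) to:
m50 = ResNet(50)
```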
For `ResNet(18)` and `ResNet(34)`, the parameter-free shortcut style (type `:A`) @@ -242,7 +260,7 @@ function ResNet(depth::Integer = 50; pretrain = false, nclasses = 1000) config, block = resnet_config[depth] model = ResNet(config...; block = block, nclasses = nclasses) pretrain && loadpretrain!(model, string("ResNet", depth)) - model + return model end # Compat with Metalhead 0.6; remove in 0.7 diff --git a/src/convnets/resnext.jl b/src/convnets/resnext.jl index eaa66f98f..b8ed03bb0 100644 --- a/src/convnets/resnext.jl +++ b/src/convnets/resnext.jl @@ -5,19 +5,20 @@ Create a basic residual block as defined in the paper for ResNeXt ([reference](https://arxiv.org/abs/1611.05431)). # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: the number of output feature maps -- `cardinality`: the number of groups to use for the convolution -- `width`: the number of feature maps in each group in the bottleneck -- `downsample`: set to `true` to downsample the input + + - `inplanes`: the number of input feature maps + - `outplanes`: the number of output feature maps + - `cardinality`: the number of groups to use for the convolution + - `width`: the number of feature maps in each group in the bottleneck + - `downsample`: set to `true` to downsample the input """ function resnextblock(inplanes, outplanes, cardinality, width, downsample = false) - stride = downsample ? 2 : 1 - hidden_channels = cardinality * width - return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., - conv_bn((3, 3), hidden_channels, hidden_channels; - stride = stride, pad = 1, bias = false, groups = cardinality)..., - conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) + stride = downsample ? 2 : 1 + hidden_channels = cardinality * width + return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., + conv_bn((3, 3), hidden_channels, hidden_channels; + stride = stride, pad = 1, bias = false, groups = cardinality)..., + conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) end """ @@ -28,40 +29,46 @@ Create a ResNeXt model ([reference](https://arxiv.org/abs/1611.05431)). # Arguments -- `cardinality`: the number of groups to use for the convolution -- `width`: the number of feature maps in each group in the bottleneck -- `widen_factor`: the factor by which the width of the bottleneck is increased after each stage -- `connection`: the binary function applied to the output of residual and skip paths in a block -- `block_config`: a list of the number of residual blocks at each stage -- `nclasses`: the number of output classes + + - `cardinality`: the number of groups to use for the convolution + - `width`: the number of feature maps in each group in the bottleneck + - `widen_factor`: the factor by which the width of the bottleneck is increased after each stage + - `connection`: the binary function applied to the output of residual and skip paths in a block + - `block_config`: a list of the number of residual blocks at each stage + - `nclasses`: the number of output classes """ -function resnext(cardinality, width, widen_factor = 2, connection = (x, y) -> @. relu(x) + relu(y); +function resnext(cardinality, width, widen_factor = 2, + connection = (x, y) -> @. 
relu(x) + relu(y); block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 128 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes * widen_factor - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, i != 1), - skip_projection(inplanes, outplanes, i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes - for _ in 2:nrepeats - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, false), - skip_identity(inplanes, outplanes, false))) + inplanes = 64 + baseplanes = 128 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) + push!(layers, MaxPool((3, 3); stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes * widen_factor + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, i != 1), + skip_projection(inplanes, outplanes, i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes + for _ in 2:nrepeats + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, false), + skip_identity(inplanes, outplanes, false))) + end + baseplanes = outplanes + # double width after every cluster of blocks + width *= widen_factor end - baseplanes = outplanes - # double width after every cluster of blocks - width *= widen_factor - end - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -71,18 +78,19 @@ Create a ResNeXt model ([reference](https://arxiv.org/abs/1611.05431)). 
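> Editor's note (not part of the patch): a usage sketch of the ResNeXt constructors defined in this file, using the stage layout from `resnext_config` further down.

```julia
using Metalhead

# ResNeXt-50 (32×4d): 32 groups with 4 feature maps per group.
m = ResNeXt(32, 4; block_config = (3, 4, 6, 3))

# The depth-based convenience constructor builds the same structure:
m50 = ResNeXt(50; cardinality = 32, width = 4)
```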
# Arguments -- `cardinality`: the number of groups to use for the convolution -- `width`: the number of feature maps in each group in the bottleneck -- `block_config`: a list of the number of residual blocks at each stage -- `nclasses`: the number of output classes + + - `cardinality`: the number of groups to use for the convolution + - `width`: the number of feature maps in each group in the bottleneck + - `block_config`: a list of the number of residual blocks at each stage + - `nclasses`: the number of output classes """ struct ResNeXt - layers + layers::Any end function ResNeXt(cardinality, width; block_config, nclasses = 1000) - layers = resnext(cardinality, width; block_config, nclasses) - ResNeXt(layers) + layers = resnext(cardinality, width; block_config, nclasses) + return ResNeXt(layers) end @functor ResNeXt @@ -92,11 +100,9 @@ end backbone(m::ResNeXt) = m.layers[1] classifier(m::ResNeXt) = m.layers[2] -const resnext_config = Dict( - 50 => (3, 4, 6, 3), - 101 => (3, 4, 23, 3), - 152 => (3, 8, 36, 3) -) +const resnext_config = Dict(50 => (3, 4, 6, 3), + 101 => (3, 4, 23, 3), + 152 => (3, 8, 36, 3)) """ ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) @@ -106,14 +112,16 @@ Create a ResNeXt model with specified configuration. Currently supported values Set `pretrain = true` to load the model with pre-trained weights for ImageNet. !!! warning - `ResNeXt` does not currently support pretrained weights. + + +`ResNeXt` does not currently support pretrained weights. See also [`Metalhead.resnext`](#). """ -function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) - @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" - - model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) - pretrain && loadpretrain!(model, string("ResNeXt", config)) - model -end \ No newline at end of file +function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, + nclasses = 1000) + @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" + model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) + pretrain && loadpretrain!(model, string("ResNeXt", config)) + return model +end diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index 169ad2e86..c4de36acc 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -5,20 +5,21 @@ Create a fire module ([reference](https://arxiv.org/abs/1602.07360v4)). 
# Arguments -- `inplanes`: number of input feature maps -- `squeeze_planes`: number of intermediate feature maps -- `expand1x1_planes`: number of output feature maps for the 1x1 expansion convolution -- `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution + + - `inplanes`: number of input feature maps + - `squeeze_planes`: number of intermediate feature maps + - `expand1x1_planes`: number of output feature maps for the 1x1 expansion convolution + - `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution """ function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) - branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) - branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) - branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) + branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) + branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) + branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, relu; pad = 1) - return Chain(branch_1, - Parallel(cat_channels, - branch_2, - branch_3)) + return Chain(branch_1, + Parallel(cat_channels, + branch_2, + branch_3)) end """ @@ -28,24 +29,24 @@ Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). """ function squeezenet() - layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), - MaxPool((3, 3), stride = 2), - fire(64, 16, 64, 64), - fire(128, 16, 64, 64), - MaxPool((3, 3), stride = 2), - fire(128, 32, 128, 128), - fire(256, 32, 128, 128), - MaxPool((3, 3), stride = 2), - fire(256, 48, 192, 192), - fire(384, 48, 192, 192), - fire(384, 64, 256, 256), - fire(512, 64, 256, 256), - Dropout(0.5), - Conv((1, 1), 512 => 1000, relu)), - AdaptiveMeanPool((1, 1)), - MLUtils.flatten) + layers = Chain(Chain(Conv((3, 3), 3 => 64, relu; stride = 2), + MaxPool((3, 3); stride = 2), + fire(64, 16, 64, 64), + fire(128, 16, 64, 64), + MaxPool((3, 3); stride = 2), + fire(128, 32, 128, 128), + fire(256, 32, 128, 128), + MaxPool((3, 3); stride = 2), + fire(256, 48, 192, 192), + fire(384, 48, 192, 192), + fire(384, 64, 256, 256), + fire(512, 64, 256, 256), + Dropout(0.5), + Conv((1, 1), 512 => 1000, relu)), + AdaptiveMeanPool((1, 1)), + MLUtils.flatten) - return layers + return layers end """ @@ -56,19 +57,19 @@ Create a SqueezeNet Set `pretrain=true` to load the model with pre-trained weights for ImageNet. !!! warning + `SqueezeNet` does not currently support pretrained weights. See also [`squeezenet`](#). """ struct SqueezeNet - layers + layers::Any end function SqueezeNet(; pretrain = false) - layers = squeezenet() - pretrain && loadpretrain!(layers, "SqueezeNet") - - SqueezeNet(layers) + layers = squeezenet() + pretrain && loadpretrain!(layers, "SqueezeNet") + return SqueezeNet(layers) end @functor SqueezeNet diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index bdca0d9ee..423bcdf53 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -5,24 +5,26 @@ A VGG block of convolution layers ([reference](https://arxiv.org/abs/1409.1556v6)). 
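> Editor's note (not part of the patch): a minimal sketch of the `fire` module defined just below; the 32×32 feature map size is illustrative.

```julia
using Metalhead

# Squeeze to 16 channels, then expand through parallel 1×1 and 3×3 branches
# whose outputs are concatenated along the channel dimension.
f = Metalhead.fire(64, 16, 64, 64)
x = rand(Float32, 32, 32, 64, 1)
size(f(x))  # (32, 32, 128, 1): 64 + 64 expansion channels
```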
# Arguments -- `ifilters`: number of input feature maps -- `ofilters`: number of output feature maps -- `depth`: number of convolution/convolution + batch norm layers -- `batchnorm`: set to `true` to include batch normalization after each convolution + + - `ifilters`: number of input feature maps + - `ofilters`: number of output feature maps + - `depth`: number of convolution/convolution + batch norm layers + - `batchnorm`: set to `true` to include batch normalization after each convolution """ function vgg_block(ifilters, ofilters, depth, batchnorm) - k = (3,3) - p = (1,1) - layers = [] - for _ in 1:depth - if batchnorm - append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) - else - push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + k = (3, 3) + p = (1, 1) + layers = [] + for _ in 1:depth + if batchnorm + append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) + else + push!(layers, Conv(k, ifilters => ofilters, relu; pad = p)) + end + ifilters = ofilters end ifilters = ofilters - end - return layers + return layers end """ @@ -32,20 +34,21 @@ Create VGG convolution layers ([reference](https://arxiv.org/abs/1409.1556v6)). # Arguments -- `config`: vector of tuples `(output_channels, num_convolutions)` - for each block (see [`Metalhead.vgg_block`](#)) -- `batchnorm`: set to `true` to include batch normalization after each convolution -- `inchannels`: number of input channels + + - `config`: vector of tuples `(output_channels, num_convolutions)` + for each block (see [`Metalhead.vgg_block`](#)) + - `batchnorm`: set to `true` to include batch normalization after each convolution + - `inchannels`: number of input channels """ function vgg_convolutional_layers(config, batchnorm, inchannels) - layers = [] - ifilters = inchannels - for c in config - append!(layers, vgg_block(ifilters, c..., batchnorm)) - push!(layers, MaxPool((2,2), stride=2)) - ifilters, _ = c - end - return layers + layers = [] + ifilters = inchannels + for c in config + append!(layers, vgg_block(ifilters, c..., batchnorm)) + push!(layers, MaxPool((2, 2); stride = 2)) + ifilters, _ = c + end + return layers end """ @@ -55,19 +58,20 @@ Create VGG classifier (fully connected) layers ([reference](https://arxiv.org/abs/1409.1556v6)). # Arguments -- `imsize`: tuple `(width, height, channels)` indicating the size after - the convolution layers (see [`Metalhead.vgg_convolutional_layers`](#)) -- `nclasses`: number of output classes -- `fcsize`: input and output size of the intermediate fully connected layer -- `dropout`: the dropout level between each fully connected layer + + - `imsize`: tuple `(width, height, channels)` indicating the size after + the convolution layers (see [`Metalhead.vgg_convolutional_layers`](#)) + - `nclasses`: number of output classes + - `fcsize`: input and output size of the intermediate fully connected layer + - `dropout`: the dropout level between each fully connected layer """ function vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(MLUtils.flatten, - Dense(Int(prod(imsize)), fcsize, relu), - Dropout(dropout), - Dense(fcsize, fcsize, relu), - Dropout(dropout), - Dense(fcsize, nclasses)) + return Chain(MLUtils.flatten, + Dense(Int(prod(imsize)), fcsize, relu), + Dropout(dropout), + Dense(fcsize, fcsize, relu), + Dropout(dropout), + Dense(fcsize, nclasses)) end """ @@ -77,27 +81,28 @@ Create a VGG model ([reference](https://arxiv.org/abs/1409.1556v6)). 
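> Editor's note (not part of the patch): a sketch of `vgg_block`, which returns a plain vector of layers that callers splat into a `Chain`; sizes here are illustrative.

```julia
using Metalhead, Flux

# Two 3×3 convolutions mapping 3 => 64 channels, without batch norm.
layers = Metalhead.vgg_block(3, 64, 2, false)
block = Chain(layers...)
size(block(rand(Float32, 224, 224, 3, 1)))  # (224, 224, 64, 1): padding keeps spatial size
```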
# Arguments -- `imsize`: input image width and height as a tuple -- `config`: the configuration for the convolution layers - (see [`Metalhead.vgg_convolutional_layers`](#)) -- `inchannels`: number of input channels -- `batchnorm`: set to `true` to use batch normalization after each convolution -- `nclasses`: number of output classes -- `fcsize`: intermediate fully connected layer size - (see [`Metalhead.vgg_classifier_layers`](#)) -- `dropout`: dropout level between fully connected layers + + - `imsize`: input image width and height as a tuple + - `config`: the configuration for the convolution layers + (see [`Metalhead.vgg_convolutional_layers`](#)) + - `inchannels`: number of input channels + - `batchnorm`: set to `true` to use batch normalization after each convolution + - `nclasses`: number of output classes + - `fcsize`: intermediate fully connected layer size + (see [`Metalhead.vgg_classifier_layers`](#)) + - `dropout`: dropout level between fully connected layers """ function vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - conv = vgg_convolutional_layers(config, batchnorm, inchannels) - imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] - class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(Chain(conv), class) + conv = vgg_convolutional_layers(config, batchnorm, inchannels) + imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] + class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) + return Chain(Chain(conv), class) end -const vgg_conv_config = Dict(:A => [(64,1), (128,1), (256,2), (512,2), (512,2)], - :B => [(64,2), (128,2), (256,2), (512,2), (512,2)], - :D => [(64,2), (128,2), (256,3), (512,3), (512,3)], - :E => [(64,2), (128,2), (256,4), (512,4), (512,4)]) +const vgg_conv_config = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512, 2)], + :B => [(64, 2), (128, 2), (256, 2), (512, 2), (512, 2)], + :D => [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)], + :E => [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]) const vgg_config = Dict(11 => :A, 13 => :B, @@ -105,7 +110,7 @@ const vgg_config = Dict(11 => :A, 19 => :E) struct VGG - layers + layers::Any end """ @@ -114,24 +119,25 @@ end Construct a VGG model with the specified input image size. Typically, the image size is `(224, 224)`. ## Keyword Arguments: -- `config` : VGG convolutional block configuration. It is defined as a vector of tuples `(output_channels, num_convolutions)` for each block -- `inchannels`::Integer : number of input channels -- `batchnorm`::Bool : set to `true` to use batch normalization after each convolution -- `nclasses`::Integer : number of output classes -- `fcsize`: intermediate fully connected layer size - (see [`Metalhead.vgg_classifier_layers`](#)) -- `dropout`: dropout level between fully connected layers + + - `config` : VGG convolutional block configuration. 
It is defined as a vector of tuples `(output_channels, num_convolutions)` for each block + - `inchannels`::Integer : number of input channels + - `batchnorm`::Bool : set to `true` to use batch normalization after each convolution + - `nclasses`::Integer : number of output classes + - `fcsize`: intermediate fully connected layer size + (see [`Metalhead.vgg_classifier_layers`](#)) + - `dropout`: dropout level between fully connected layers """ function VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - layers = vgg(imsize; config = config, - inchannels = inchannels, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = fcsize, - dropout = dropout) - - VGG(layers) + layers = vgg(imsize; config = config, + inchannels = inchannels, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = fcsize, + dropout = dropout) + + return VGG(layers) end @functor VGG @@ -149,27 +155,27 @@ Create a VGG style model with specified `depth`. Available values include (11, 1 See also [`VGG`](#). !!! warning + `VGG` does not currently support pretrained weights. # Arguments -- `pretrain`: set to `true` to load pre-trained model weights for ImageNet + + - `pretrain`: set to `true` to load pre-trained model weights for ImageNet """ function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000) - @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" - - model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], - inchannels = 3, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = 4096, - dropout = 0.5) - - if pretrain && !batchnorm - loadpretrain!(model, string("VGG", depth)) - elseif pretrain - loadpretrain!(model, "VGG$(depth)-BN)") - end - model + @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" + model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], + inchannels = 3, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = 4096, + dropout = 0.5) + if pretrain && !batchnorm + loadpretrain!(model, string("VGG", depth)) + elseif pretrain + loadpretrain!(model, "VGG$(depth)-BN)") + end + return model end # deprecations diff --git a/src/layers/attention.jl b/src/layers/attention.jl index 10baf73e9..3d63ddad0 100644 --- a/src/layers/attention.jl +++ b/src/layers/attention.jl @@ -4,16 +4,17 @@ Multi-head self-attention layer. # Arguments: -- `nheads`: Number of heads -- `qkv_layer`: layer to be used for getting the query, key and value -- `attn_drop`: dropout rate after the self-attention layer -- `projection`: projection layer to be used after self-attention + + - `nheads`: Number of heads + - `qkv_layer`: layer to be used for getting the query, key and value + - `attn_drop`: dropout rate after the self-attention layer + - `projection`: projection layer to be used after self-attention """ struct MHAttention{P, Q, R} - nheads::Int - qkv_layer::P - attn_drop::Q - projection::R + nheads::Int + qkv_layer::P + attn_drop::Q + projection::R end """ @@ -22,37 +23,38 @@ end Multi-head self-attention layer. # Arguments: -- `planes`: number of input channels -- `nheads`: number of heads -- `qkv_bias`: whether to use bias in the layer to get the query, key and value -- `attn_drop`: dropout rate after the self-attention layer -- `proj_drop`: dropout rate after the projection layer -""" -function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_drop = 0., proj_drop = 0.) 
- @assert planes % nheads == 0 "planes should be divisible by nheads" - qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) - attn_drop = Dropout(attn_drop) - proj = Chain(Dense(planes, planes), Dropout(proj_drop)) - MHAttention(nheads, qkv_layer, attn_drop, proj) + - `planes`: number of input channels + - `nheads`: number of heads + - `qkv_bias`: whether to use bias in the layer to get the query, key and value + - `attn_drop`: dropout rate after the self-attention layer + - `proj_drop`: dropout rate after the projection layer +""" +function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, + attn_drop = 0.0, proj_drop = 0.0) + @assert planes % nheads==0 "planes should be divisible by nheads" + qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) + attn_drop = Dropout(attn_drop) + proj = Chain(Dense(planes, planes), Dropout(proj_drop)) + return MHAttention(nheads, qkv_layer, attn_drop, proj) end @functor MHAttention function (m::MHAttention)(x::AbstractArray{T, 3}) where {T} - nfeatures, seq_len, batch_size = size(x) - x_reshaped = reshape(x, nfeatures, seq_len * batch_size) - qkv = m.qkv_layer(x_reshaped) - qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) - query, key, value = chunk(qkv_reshaped, 3; dims = 4) - scale = convert(T, sqrt(size(query, 1) / m.nheads)) - key_reshaped = reshape( - permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, seq_len * batch_size - ) - query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) - value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - pre_projection = reshape(batched_mul(attention, value_reshaped), (nfeatures, seq_len, batch_size)) - y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) - return reshape(y, :, seq_len, batch_size) + nfeatures, seq_len, batch_size = size(x) + x_reshaped = reshape(x, nfeatures, seq_len * batch_size) + qkv = m.qkv_layer(x_reshaped) + qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) + query, key, value = chunk(qkv_reshaped, 3; dims = 4) + scale = convert(T, sqrt(size(query, 1) / m.nheads)) + key_reshaped = reshape(permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, + seq_len * batch_size) + query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) + value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + pre_projection = reshape(batched_mul(attention, value_reshaped), + (nfeatures, seq_len, batch_size)) + y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) + return reshape(y, :, seq_len, batch_size) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ca30df8a4..d9b631bb0 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -7,45 +7,44 @@ Create a convolution + batch normalization pair with activation. 
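> Editor's note (not part of the patch): `conv_bn` returns a vector `[Conv, BatchNorm]` (reversed when `rev = true`), which is why call sites throughout this diff splat it into a `Chain`. A sketch mirroring the ResNet stem:

```julia
using Metalhead, Flux

stem = Chain(Metalhead.conv_bn((7, 7), 3, 64; stride = 2, pad = 3, bias = false)...)
size(stem(rand(Float32, 224, 224, 3, 1)))  # (112, 112, 64, 1)
```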
# Arguments -- `kernelsize`: size of the convolution kernel (tuple) -- `inplanes`: number of input feature maps -- `outplanes`: number of output feature maps -- `activation`: the activation function for the final layer -- `rev`: set to `true` to place the batch norm before the convolution -- `preact`: set to `true` to place the activation function before the batch norm - (only compatible with `rev = false`) -- `stride`: stride of the convolution kernel -- `pad`: padding of the convolution kernel -- `dilation`: dilation of the convolution kernel -- `groups`: groups for the convolution kernel -- `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) -- `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) -- `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) + + - `kernelsize`: size of the convolution kernel (tuple) + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps + - `activation`: the activation function for the final layer + - `rev`: set to `true` to place the batch norm before the convolution + - `preact`: set to `true` to place the activation function before the batch norm + (only compatible with `rev = false`) + - `stride`: stride of the convolution kernel + - `pad`: padding of the convolution kernel + - `dilation`: dilation of the convolution kernel + - `groups`: groups for the convolution kernel + - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) + - `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) + - `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) """ function conv_bn(kernelsize, inplanes, outplanes, activation = relu; rev = false, preact = false, - initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1f-5, momentum = 1f-1, + initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1.0f-5, momentum = 1.0f-1, kwargs...) - layers = [] - - if rev - activations = (conv = activation, bn = identity) - bnplanes = inplanes - else - activations = (conv = identity, bn = activation) - bnplanes = outplanes - end - - if preact - rev ? throw(ArgumentError("preact and rev cannot be set at the same time")) : - activations = (conv = activation, bn = identity) - end - - push!(layers, Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) - push!(layers, BatchNorm(Int(bnplanes), activations.bn; - initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) - - return rev ? reverse(layers) : layers + layers = [] + if rev + activations = (conv = activation, bn = identity) + bnplanes = inplanes + else + activations = (conv = identity, bn = activation) + bnplanes = outplanes + end + if preact + rev ? throw(ArgumentError("preact and rev cannot be set at the same time")) : + activations = (conv = activation, bn = identity) + end + push!(layers, + Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) + push!(layers, + BatchNorm(Int(bnplanes), activations.bn; + initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) + return rev ? reverse(layers) : layers end """ @@ -57,38 +56,41 @@ end Create a depthwise separable convolution chain as used in MobileNet v1. 
This is sequence of layers: -- a `kernelsize` depthwise convolution from `inplanes => inplanes` -- a batch norm layer + `activation` -- a `kernelsize` convolution from `inplanes => outplanes` -- a batch norm layer + `activation` + + - a `kernelsize` depthwise convolution from `inplanes => inplanes` + - a batch norm layer + `activation` + - a `kernelsize` convolution from `inplanes => outplanes` + - a batch norm layer + `activation` See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). # Arguments -- `kernelsize`: size of the convolution kernel (tuple) -- `inplanes`: number of input feature maps -- `outplanes`: number of output feature maps -- `activation`: the activation function for the final layer -- `rev`: set to `true` to place the batch norm before the convolution -- `stride`: stride of the first convolution kernel -- `pad`: padding of the first convolution kernel -- `dilation`: dilation of the first convolution kernel -- `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) -- `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) -- `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) + + - `kernelsize`: size of the convolution kernel (tuple) + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps + - `activation`: the activation function for the final layer + - `rev`: set to `true` to place the batch norm before the convolution + - `stride`: stride of the first convolution kernel + - `pad`: padding of the first convolution kernel + - `dilation`: dilation of the first convolution kernel + - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) + - `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) + - `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) """ -depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; - rev = false, - initβ = Flux.zeros32, initγ = Flux.ones32, - ϵ = 1f-5, momentum = 1f-1, - stride = 1, kwargs...) = - vcat(conv_bn(kernelsize, inplanes, inplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum, - stride = stride, groups = Int(inplanes), kwargs...), - conv_bn((1, 1), inplanes, outplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum)) +function depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; + rev = false, + initβ = Flux.zeros32, initγ = Flux.ones32, + ϵ = 1.0f-5, momentum = 1.0f-1, + stride = 1, kwargs...) + return vcat(conv_bn(kernelsize, inplanes, inplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum, + stride = stride, groups = Int(inplanes), kwargs...), + conv_bn((1, 1), inplanes, outplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum)) +end """ skip_projection(inplanes, outplanes, downsample = false) @@ -97,13 +99,16 @@ Create a skip projection ([reference](https://arxiv.org/abs/1512.03385v1)). # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: the number of output feature maps -- `downsample`: set to `true` to downsample the input + + - `inplanes`: the number of input feature maps + - `outplanes`: the number of output feature maps + - `downsample`: set to `true` to downsample the input """ -skip_projection(inplanes, outplanes, downsample = false) = downsample ? 
- Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +function skip_projection(inplanes, outplanes, downsample = false) + return downsample ? + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +end # array -> PaddedView(0, array, outplanes) for zero padding arrays """ @@ -113,20 +118,22 @@ Create a identity projection ([reference](https://arxiv.org/abs/1512.03385v1)). # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: the number of output feature maps -- `downsample`: this argument is ignored but it is needed for compatibility with [`resnet`](#). + + - `inplanes`: the number of input feature maps + - `outplanes`: the number of output feature maps + - `downsample`: this argument is ignored but it is needed for compatibility with [`resnet`](#). """ function skip_identity(inplanes, outplanes) - if outplanes > inplanes - return Chain(MaxPool((1, 1), stride = 2), - y -> cat(y, zeros(eltype(y), - size(y, 1), - size(y, 2), - outplanes - inplanes, size(y, 4)); dims = 3)) - else - return identity - end + if outplanes > inplanes + return Chain(MaxPool((1, 1); stride = 2), + y -> cat(y, + zeros(eltype(y), + size(y, 1), + size(y, 2), + outplanes - inplanes, size(y, 4)); dims = 3)) + else + return identity + end end skip_identity(inplanes, outplanes, downsample) = skip_identity(inplanes, outplanes) @@ -137,15 +144,18 @@ Squeeze and excitation layer used by MobileNet variants ([reference](https://arxiv.org/abs/1905.02244)). # Arguments -- `channels`: the number of input/output feature maps -- `reduction = 4`: the reduction factor for the number of hidden feature maps - (must be >= 1) + + - `channels`: the number of input/output feature maps + - `reduction = 4`: the reduction factor for the number of hidden feature maps + (must be >= 1) """ function squeeze_excite(channels, reduction = 4) - @assert (reduction >= 1) "`reduction` must be >= 1" - SkipConnection(Chain(AdaptiveMeanPool((1, 1)), - conv_bn((1, 1), channels, channels ÷ reduction, relu; bias = false)..., - conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) + @assert (reduction>=1) "`reduction` must be >= 1" + return SkipConnection(Chain(AdaptiveMeanPool((1, 1)), + conv_bn((1, 1), channels, channels ÷ reduction, relu; + bias = false)..., + conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), + .*) end """ @@ -156,31 +166,32 @@ Create a basic inverted residual block for MobileNet variants ([reference](https://arxiv.org/abs/1905.02244)). # Arguments -- `kernel_size`: The kernel size of the convolutional layers -- `inplanes`: The number of input feature maps -- `hidden_planes`: The number of feature maps in the hidden layer -- `outplanes`: The number of output feature maps -- `activation`: The activation function for the first two convolution layer -- `stride`: The stride of the convolutional kernel, has to be either 1 or 2 -- `reduction`: The reduction factor for the number of hidden feature maps - in a squeeze and excite layer (see [`squeeze_excite`](#)). - Must be >= 1 or `nothing` for no squeeze and excite layer. -""" -function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation = relu; - stride, reduction = nothing) - @assert stride in [1, 2] "`stride` has to be 1 or 2" - pad = @. (kernel_size - 1) ÷ 2 - conv1 = (inplanes == hidden_planes) ? 
identity : Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) - selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) - - invres = Chain(conv1, - conv_bn(kernel_size, hidden_planes, hidden_planes, activation; - bias = false, stride, pad = pad, groups = hidden_planes)..., - selayer, - conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) + - `kernel_size`: The kernel size of the convolutional layers + - `inplanes`: The number of input feature maps + - `hidden_planes`: The number of feature maps in the hidden layer + - `outplanes`: The number of output feature maps + - `activation`: The activation function for the first two convolution layer + - `stride`: The stride of the convolutional kernel, has to be either 1 or 2 + - `reduction`: The reduction factor for the number of hidden feature maps + in a squeeze and excite layer (see [`squeeze_excite`](#)). + Must be >= 1 or `nothing` for no squeeze and excite layer. +""" +function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, + activation = relu; stride, reduction = nothing) + @assert stride in [1, 2] "`stride` has to be 1 or 2" + pad = @. (kernel_size - 1) ÷ 2 + conv1 = (inplanes == hidden_planes) ? identity : + Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) + selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) + invres = Chain(conv1, + conv_bn(kernel_size, hidden_planes, hidden_planes, activation; + bias = false, stride, pad = pad, groups = hidden_planes)..., + selayer, + conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) + return (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres +end - (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres +function invertedresidual(kernel_size::Integer, args...; kwargs...) + return invertedresidual((kernel_size, kernel_size), args...; kwargs...) end -invertedresidual(kernel_size::Integer, args...; kwargs...) = - invertedresidual((kernel_size, kernel_size), args...; kwargs...) diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 06116bdc2..7c0d4f7e6 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -5,31 +5,31 @@ _flatten_spatial(x) = permutedims(reshape(x, (:, size(x, 3), size(x, 4))), (2, 1 patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) -Patch embedding layer used by many vision transformer-like models to split the input image into +Patch embedding layer used by many vision transformer-like models to split the input image into patches. 
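> Editor's note (not part of the patch): a sketch of `PatchEmbedding` with its documented defaults; the output shape follows from the stride-`patch_size` convolution plus `_flatten_spatial`.

```julia
using Metalhead

# Split a 224×224 RGB image into 16×16 patches, embedding each into 768 channels.
pe = Metalhead.PatchEmbedding((224, 224); inchannels = 3, patch_size = (16, 16),
                              embedplanes = 768)
size(pe(rand(Float32, 224, 224, 3, 1)))  # (768, 196, 1), where 196 = (224 ÷ 16)^2
```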
# Arguments: -- `imsize`: the size of the input image -- `inchannels`: the number of channels in the input image -- `patch_size`: the size of the patches -- `embedplanes`: the number of channels in the embedding -- `norm_layer`: the normalization layer - by default the identity function but otherwise takes a - single argument constructor for a normalization layer like LayerNorm or BatchNorm -- `flatten`: set true to flatten the input spatial dimensions after the embedding + + - `imsize`: the size of the input image + - `inchannels`: the number of channels in the input image + - `patch_size`: the size of the patches + - `embedplanes`: the number of channels in the embedding + - `norm_layer`: the normalization layer - by default the identity function but otherwise takes a + single argument constructor for a normalization layer like LayerNorm or BatchNorm + - `flatten`: set true to flatten the input spatial dimensions after the embedding """ function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) + im_height, im_width = imsize + patch_height, patch_width = patch_size - im_height, im_width = imsize - patch_height, patch_width = patch_size - - @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) - "Image dimensions must be divisible by the patch size." + @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) + "Image dimensions must be divisible by the patch size." - return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), - flatten ? _flatten_spatial : identity, - norm_layer(embedplanes)) + return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), + flatten ? _flatten_spatial : identity, + norm_layer(embedplanes)) end """ @@ -38,11 +38,13 @@ end Positional embedding layer used by many vision transformer-like models. """ struct ViPosEmbedding{T} - vectors::T + vectors::T end -ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) = - ViPosEmbedding(init((embedsize, npatches))) +function ViPosEmbedding(embedsize::Integer, npatches::Integer; + init = (dims::Dims{2}) -> rand(Float32, dims)) + return ViPosEmbedding(init((embedsize, npatches))) +end (p::ViPosEmbedding)(x) = x .+ p.vectors @@ -54,7 +56,7 @@ ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> Appends class tokens to an input with embedding dimension `dim` for use in many vision transformer models. """ struct ClassTokens{T} - token::T + token::T end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index ca8f38f97..25ead874b 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -5,16 +5,17 @@ Feedforward block used in many MLPMixer-like and vision-transformer models. # Arguments -- `inplanes`: Number of dimensions in the input. -- `hidden_planes`: Number of dimensions in the intermediate layer. -- `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. -- `dropout`: Dropout rate. -- `activation`: Activation function to use. + + - `inplanes`: Number of dimensions in the input. + - `hidden_planes`: Number of dimensions in the intermediate layer. + - `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. + - `dropout`: Dropout rate. + - `activation`: Activation function to use. 
""" -function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; - dropout = 0., activation = gelu) - Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), - Dense(hidden_planes, outplanes), Dropout(dropout)) +function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; + dropout = 0.0, activation = gelu) + return Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), + Dense(hidden_planes, outplanes), Dropout(dropout)) end """ @@ -25,20 +26,21 @@ Feedforward block based on the implementation in the paper "Pay Attention to MLP ([reference](https://arxiv.org/abs/2105.08050)) # Arguments -- `gate_layer`: Layer to use for the gating. -- `inplanes`: Number of dimensions in the input. -- `hidden_planes`: Number of dimensions in the intermediate layer. -- `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. -- `dropout`: Dropout rate. -- `activation`: Activation function to use. + + - `gate_layer`: Layer to use for the gating. + - `inplanes`: Number of dimensions in the input. + - `hidden_planes`: Number of dimensions in the intermediate layer. + - `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. + - `dropout`: Dropout rate. + - `activation`: Activation function to use. """ function gated_mlp_block(gate_layer, inplanes::Integer, hidden_planes::Integer, - outplanes::Integer = inplanes; dropout = 0., activation = gelu) - @assert hidden_planes % 2 == 0 "`hidden_planes` must be even for gated MLP" - return Chain(Dense(inplanes, hidden_planes, activation), - Dropout(dropout), - gate_layer(hidden_planes), - Dense(hidden_planes ÷ 2, outplanes), - Dropout(dropout)) + outplanes::Integer = inplanes; dropout = 0.0, activation = gelu) + @assert hidden_planes % 2==0 "`hidden_planes` must be even for gated MLP" + return Chain(Dense(inplanes, hidden_planes, activation), + Dropout(dropout), + gate_layer(hidden_planes), + Dense(hidden_planes ÷ 2, outplanes), + Dropout(dropout)) end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index a7bce3e6c..4f69dab03 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -5,23 +5,23 @@ prenorm(planes, fn) = Chain(LayerNorm(planes), fn) ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1f-5) A variant of LayerNorm where the input is normalised along the -channel dimension. The input is expected to have channel dimension with size +channel dimension. The input is expected to have channel dimension with size `sz`. It also applies a learnable shift and rescaling after the normalization. Note that this is specifically for inputs with 4 dimensions in the format (H, W, C, N) where H, W are the height and width of the input, C is the number of channels, and N is the batch size. 
""" -struct ChannelLayerNorm{D,T} - diag::D - ϵ::T +struct ChannelLayerNorm{D, T} + diag::D + ϵ::T end @functor ChannelLayerNorm -(m::ChannelLayerNorm)(x) = m.diag(MLUtils.normalise(x, dims = ndims(x) - 1, ϵ = m.ϵ)) +(m::ChannelLayerNorm)(x) = m.diag(MLUtils.normalise(x; dims = ndims(x) - 1, ϵ = m.ϵ)) -function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1f-5) - diag = Flux.Scale(1, 1, sz, λ) - return ChannelLayerNorm(diag, ϵ) +function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1.0f-5) + diag = Flux.Scale(1, 1, sz, λ) + return ChannelLayerNorm(diag, ϵ) end diff --git a/src/layers/others.jl b/src/layers/others.jl index 366b273e4..770bccebd 100644 --- a/src/layers/others.jl +++ b/src/layers/others.jl @@ -5,11 +5,13 @@ Creates a `Flux.Scale` layer that performs "`LayerScale`" ([reference](https://arxiv.org/abs/2103.17239)). # Arguments -- `planes`: Size of channel dimension in the input. -- `λ`: initialisation value for the learnable diagonal matrix. + + - `planes`: Size of channel dimension in the input. + - `λ`: initialisation value for the learnable diagonal matrix. """ -LayerScale(planes::Integer, λ) = - λ > 0 ? Flux.Scale(fill(Float32(λ), planes), false) : identity +function LayerScale(planes::Integer, λ) + return λ > 0 ? Flux.Scale(fill(Float32(λ), planes), false) : identity +end """ DropPath(p) @@ -18,6 +20,7 @@ Implements Stochastic Depth - equivalent to `Dropout(p; dims = 4)` when `p` ≥ ([reference](https://arxiv.org/abs/1603.09382)) # Arguments -- `p`: rate of Stochastic Depth. + + - `p`: rate of Stochastic Depth. """ -DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity \ No newline at end of file +DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity diff --git a/src/other/mlpmixer.jl b/src/other/mlpmixer.jl index 880486dc2..942abc823 100644 --- a/src/other/mlpmixer.jl +++ b/src/other/mlpmixer.jl @@ -6,26 +6,27 @@ Creates a feedforward block for the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)) # Arguments: -- `planes`: the number of planes in the block -- `npatches`: the number of patches of the input -- `mlp_ratio`: number(s) that determine(s) the number of hidden channels in the token mixing MLP - and/or the channel mixing MLP as a ratio to the number of planes in the block. -- `mlp_layer`: the MLP layer to use in the block -- `dropout`: the dropout rate to use in the MLP blocks -- `drop_path_rate`: Stochastic depth rate -- `activation`: the activation function to use in the MLP blocks + + - `planes`: the number of planes in the block + - `npatches`: the number of patches of the input + - `mlp_ratio`: number(s) that determine(s) the number of hidden channels in the token mixing MLP + and/or the channel mixing MLP as a ratio to the number of planes in the block. 
+ - `mlp_layer`: the MLP layer to use in the block + - `dropout`: the dropout rate to use in the MLP blocks + - `drop_path_rate`: Stochastic depth rate + - `activation`: the activation function to use in the MLP blocks """ -function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu) - tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] - return Chain(SkipConnection(Chain(LayerNorm(planes), - swapdims((2, 1, 3)), - mlp_layer(npatches, tokenplanes; activation, dropout), - swapdims((2, 1, 3)), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(LayerNorm(planes), - mlp_layer(planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +)) +function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, + dropout = 0.0, drop_path_rate = 0.0, activation = gelu) + tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] + return Chain(SkipConnection(Chain(LayerNorm(planes), + swapdims((2, 1, 3)), + mlp_layer(npatches, tokenplanes; activation, dropout), + swapdims((2, 1, 3)), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(LayerNorm(planes), + mlp_layer(planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +)) end """ @@ -37,40 +38,44 @@ Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). # Arguments -- `block`: the type of mixer block to use in the model - architecture dependent - (a constructor of the form `block(embedplanes, npatches; drop_path_rate, kwargs...)`) -- `imsize`: the size of the input image -- `inchannels`: the number of input channels -- `norm_layer`: the normalization layer to use in the model -- `patch_size`: the size of the patches -- `embedplanes`: the number of channels after the patch embedding (denotes the hidden dimension) -- `drop_path_rate`: Stochastic depth rate -- `depth`: the number of blocks in the model -- `nclasses`: number of output classes -- `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if - not specified. + + - `block`: the type of mixer block to use in the model - architecture dependent + (a constructor of the form `block(embedplanes, npatches; drop_path_rate, kwargs...)`) + - `imsize`: the size of the input image + - `inchannels`: the number of input channels + - `norm_layer`: the normalization layer to use in the model + - `patch_size`: the size of the patches + - `embedplanes`: the number of channels after the patch embedding (denotes the hidden dimension) + - `drop_path_rate`: Stochastic depth rate + - `depth`: the number of blocks in the model + - `nclasses`: number of output classes + - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if + not specified. """ -function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, norm_layer = LayerNorm, - patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., +function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, + norm_layer = LayerNorm, + patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, depth = 12, nclasses = 1000, kwargs...) - npatches = prod(imsize .÷ patch_size) - dp_rates = LinRange{Float32}(0., drop_path_rate, depth) - layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], kwargs...) 
- for i in 1:depth])) - - classification_head = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) - return Chain(layers, classification_head) + npatches = prod(imsize .÷ patch_size) + dp_rates = LinRange{Float32}(0.0, drop_path_rate, depth) + layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], + kwargs...) + for i in 1:depth])) + + classification_head = Chain(norm_layer(embedplanes), seconddimmean, + Dense(embedplanes, nclasses)) + return Chain(layers, classification_head) end # Configurations for MLPMixer models -mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), - :base => Dict(:depth => 12, :planes => 768), +mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), + :base => Dict(:depth => 12, :planes => 768), :large => Dict(:depth => 24, :planes => 1024), - :huge => Dict(:depth => 32, :planes => 1280)) + :huge => Dict(:depth => 32, :planes => 1280)) struct MLPMixer - layers + layers::Any end """ @@ -81,21 +86,23 @@ Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). # Arguments -- `size`: the size of the model - one of `small`, `base`, `large` or `huge` -- `patch_size`: the size of the patches -- `imsize`: the size of the input image -- `drop_path_rate`: Stochastic depth rate -- `nclasses`: number of output classes + + - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `patch_size`: the size of the patches + - `imsize`: the size of the input image + - `drop_path_rate`: Stochastic depth rate + - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). """ function MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, nclasses) - MLPMixer(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, + nclasses) + return MLPMixer(layers) end @functor MLPMixer @@ -113,32 +120,34 @@ Creates a block for the ResMixer architecture. ([reference](https://arxiv.org/abs/2105.03404)). 
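A quick end-to-end sketch of the user-facing `MLPMixer` constructor whose formatting changed above; the expected output size follows the package's own tests, and the configuration comment reflects the `mixer_configs` table:

```julia
using Metalhead

m = MLPMixer(:small; patch_size = (16, 16), imsize = (224, 224))  # 8 blocks, 512 planes
x = rand(Float32, 224, 224, 3, 1)
size(m(x))   # (1000, 1) with the default nclasses = 1000
```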
# Arguments -- `planes`: the number of planes in the block -- `npatches`: the number of patches of the input -- `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number - of planes in the block -- `mlp_layer`: the MLP block to use -- `dropout`: the dropout rate to use in the MLP blocks -- `drop_path_rate`: Stochastic depth rate -- `activation`: the activation function to use in the MLP blocks -- `λ`: initialisation constant for the LayerScale + + - `planes`: the number of planes in the block + - `npatches`: the number of patches of the input + - `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number + of planes in the block + - `mlp_layer`: the MLP block to use + - `dropout`: the dropout rate to use in the MLP blocks + - `drop_path_rate`: Stochastic depth rate + - `activation`: the activation function to use in the MLP blocks + - `λ`: initialisation constant for the LayerScale """ function resmixerblock(planes, npatches; mlp_ratio = 4.0, mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu, λ = 1e-4) -return Chain(SkipConnection(Chain(Flux.Scale(planes), - swapdims((2, 1, 3)), - Dense(npatches, npatches), - swapdims((2, 1, 3)), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(Flux.Scale(planes), - mlp_layer(planes, Int(mlp_ratio * planes); dropout, activation), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +)) + dropout = 0.0, drop_path_rate = 0.0, activation = gelu, λ = 1e-4) + return Chain(SkipConnection(Chain(Flux.Scale(planes), + swapdims((2, 1, 3)), + Dense(npatches, npatches), + swapdims((2, 1, 3)), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(Flux.Scale(planes), + mlp_layer(planes, Int(mlp_ratio * planes); dropout, + activation), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +)) end struct ResMLP - layers + layers::Any end """ @@ -149,22 +158,23 @@ Creates a model with the ResMLP architecture. ([reference](https://arxiv.org/abs/2105.03404)). # Arguments -- `size`: the size of the model - one of `small`, `base`, `large` or `huge` -- `patch_size`: the size of the patches -- `imsize`: the size of the input image -- `drop_path_rate`: Stochastic depth rate -- `nclasses`: number of output classes + + - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `patch_size`: the size of the patches + - `imsize`: the size of the input image + - `drop_path_rate`: Stochastic depth rate + - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). 
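`resmixerblock` above composes the `LayerScale` and `DropPath` helpers from `src/layers/others.jl`, both of which collapse to `identity` for non-positive arguments. A rough illustration using the non-exported helpers (shapes are illustrative, not prescribed by the patch):

```julia
using Metalhead, Flux

Metalhead.LayerScale(64, 0) === identity   # λ must be > 0 to get a learnable Flux.Scale
Metalhead.DropPath(-1)      === identity   # a negative rate disables stochastic depth
dp = Metalhead.DropPath(0.1)               # Dropout(0.1; dims = 4): drops whole samples

block = Metalhead.resmixerblock(64, 196)           # planes = 64, npatches = 196
size(block(rand(Float32, 64, 196, 2)))             # (64, 196, 2): residual blocks keep the shape
```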
""" function ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - drop_path_rate, depth, nclasses) - ResMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, + drop_path_rate, depth, nclasses) + return ResMLP(layers) end @functor ResMLP @@ -179,14 +189,15 @@ classifier(m::ResMLP) = m.layers[2] Creates a spatial gating unit as described in the gMLP paper. ([reference](https://arxiv.org/abs/2105.08050)) - + # Arguments -- `norm`: the normalisation layer to use -- `proj`: the projection layer to use + + - `norm`: the normalisation layer to use + - `proj`: the projection layer to use """ struct SpatialGatingUnit{T, F} - norm::T - proj::F + norm::T + proj::F end """ @@ -196,24 +207,25 @@ Creates a spatial gating unit as described in the gMLP paper. ([reference](https://arxiv.org/abs/2105.08050)) # Arguments -- `planes`: the number of planes in the block -- `npatches`: the number of patches of the input -- `norm_layer`: the normalisation layer to use + + - `planes`: the number of planes in the block + - `npatches`: the number of patches of the input + - `norm_layer`: the normalisation layer to use """ function SpatialGatingUnit(planes::Integer, npatches::Integer; norm_layer = LayerNorm) - gateplanes = planes ÷ 2 - norm = norm_layer(gateplanes) - proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) - return SpatialGatingUnit(norm, proj) + gateplanes = planes ÷ 2 + norm = norm_layer(gateplanes) + proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) + return SpatialGatingUnit(norm, proj) end @functor SpatialGatingUnit function (m::SpatialGatingUnit)(x) - u, v = chunk(x, 2; dims = 1) - v = m.norm(v) - v = m.proj(permutedims(v, (2, 1, 3))) - return u .* permutedims(v, (2, 1, 3)) + u, v = chunk(x, 2; dims = 1) + v = m.norm(v) + v = m.proj(permutedims(v, (2, 1, 3))) + return u .* permutedims(v, (2, 1, 3)) end """ @@ -225,27 +237,29 @@ Creates a feedforward block based on the gMLP model architecture described in th ([reference](https://arxiv.org/abs/2105.08050)) # Arguments -- `planes`: the number of planes in the block -- `npatches`: the number of patches of the input -- `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number - of planes in the block -- `norm_layer`: the normalisation layer to use -- `dropout`: the dropout rate to use in the MLP blocks -- `drop_path_rate`: Stochastic depth rate -- `activation`: the activation function to use in the MLP blocks + + - `planes`: the number of planes in the block + - `npatches`: the number of patches of the input + - `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number + of planes in the block + - `norm_layer`: the normalisation layer to use + - `dropout`: the dropout rate to use in the MLP blocks + - `drop_path_rate`: Stochastic depth rate + - `activation`: the activation function to use in the MLP blocks """ function 
spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = LayerNorm, - mlp_layer = gated_mlp_block, dropout = 0., drop_path_rate = 0., + mlp_layer = gated_mlp_block, dropout = 0.0, + drop_path_rate = 0.0, activation = gelu) - channelplanes = Int(mlp_ratio * planes) - sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) - return SkipConnection(Chain(norm_layer(planes), - mlp_layer(sgu, planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +) + channelplanes = Int(mlp_ratio * planes) + sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) + return SkipConnection(Chain(norm_layer(planes), + mlp_layer(sgu, planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +) end struct gMLP - layers + layers::Any end """ @@ -256,23 +270,23 @@ Creates a model with the gMLP architecture. ([reference](https://arxiv.org/abs/2105.08050)). # Arguments -- `size`: the size of the model - one of `small`, `base`, `large` or `huge` -- `patch_size`: the size of the patches -- `imsize`: the size of the input image -- `drop_path_rate`: Stochastic depth rate -- `nclasses`: number of output classes + + - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `patch_size`: the size of the patches + - `imsize`: the size of the input image + - `drop_path_rate`: Stochastic depth rate + - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). """ function gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, - patch_size, embedplanes, drop_path_rate, depth, nclasses) - - gMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, + patch_size, embedplanes, drop_path_rate, depth, nclasses) + return gMLP(layers) end @functor gMLP diff --git a/src/pretrain.jl b/src/pretrain.jl index 97ab7398e..24e6d176d 100644 --- a/src/pretrain.jl +++ b/src/pretrain.jl @@ -4,17 +4,17 @@ Load the pre-trained weights for `model` using the stored artifacts. 
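The `gMLP` constructor reformatted above builds on `spatial_gating_block`, whose `SpatialGatingUnit` splits the hidden channels in half — which is why `gated_mlp_block` requires an even `hidden_planes`. A hedged usage sketch mirroring `test/other.jl`:

```julia
using Metalhead

m = gMLP(:small; drop_path_rate = 0.1)     # spatial gating blocks + gated MLPs
size(m(rand(Float32, 224, 224, 3, 1)))     # (1000, 1) with the default nclasses = 1000
```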
""" function weights(model) - try - path = joinpath(@artifact_str(model), "$model.bson") - artifact = BSON.load(path, @__MODULE__) - if haskey(artifact, :model) - return artifact[:model] - else - throw(ArgumentError("No pre-trained weights available for $model.")) + try + path = joinpath(@artifact_str(model), "$model.bson") + artifact = BSON.load(path, @__MODULE__) + if haskey(artifact, :model) + return artifact[:model] + else + throw(ArgumentError("No pre-trained weights available for $model.")) + end + catch e + throw(ArgumentError("No pre-trained weights available for $model.")) end - catch e - throw(ArgumentError("No pre-trained weights available for $model.")) - end end """ diff --git a/src/utilities.jl b/src/utilities.jl index 39dbdd3b2..dd3f2ed74 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -1,12 +1,12 @@ # Utility function for classifier head of vision transformer-like models -seconddimmean(x) = dropdims(mean(x, dims = 2); dims = 2) +seconddimmean(x) = dropdims(mean(x; dims = 2); dims = 2) # utility function for making sure that all layers have a channel size divisible by 8 # used by MobileNet variants function _round_channels(channels, divisor, min_value = divisor) - new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) - # Make sure that round down does not go down by more than 10% - return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels + new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) + # Make sure that round down does not go down by more than 10% + return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels end """ @@ -47,11 +47,11 @@ swapdims(perm) = Base.Fix2(permutedims, perm) # Utility function for pretty printing large models function _maybe_big_show(io, model) - if isdefined(Flux, :_big_show) - if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL - Flux._big_show(io, model) - else - show(io, model) + if isdefined(Flux, :_big_show) + if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL + Flux._big_show(io, model) + else + show(io, model) + end end - end end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 55b3e3d30..dffc93ccf 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -1,23 +1,26 @@ """ - transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) +transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) Transformer as used in the base ViT architecture. ([reference](https://arxiv.org/abs/2010.11929)). # Arguments -- `planes`: number of input channels -- `depth`: number of attention blocks -- `nheads`: number of attention heads -- `mlp_ratio`: ratio of MLP layers to the number of input channels -- `dropout`: dropout rate + + - `planes`: number of input channels + - `depth`: number of attention blocks + - `nheads`: number of attention heads + - `mlp_ratio`: ratio of MLP layers to the number of input channels + - `dropout`: dropout rate """ -function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) 
- layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; attn_drop = dropout, - proj_drop = dropout)), +), - SkipConnection(prenorm(planes, mlp_block(planes, floor(Int, mlp_ratio * planes); - dropout)), +)) - for _ in 1:depth] - Chain(layers) +function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.0) + layers = [Chain(SkipConnection(prenorm(planes, + MHAttention(planes, nheads; attn_drop = dropout, + proj_drop = dropout)), +), + SkipConnection(prenorm(planes, + mlp_block(planes, floor(Int, mlp_ratio * planes); + dropout)), +)) + for _ in 1:depth] + return Chain(layers) end """ @@ -29,32 +32,32 @@ Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). # Arguments -- `imsize`: image size -- `inchannels`: number of input channels -- `patch_size`: size of the patches -- `embedplanes`: the number of channels after the patch embedding -- `depth`: number of blocks in the transformer -- `nheads`: number of attention heads in the transformer -- `mlpplanes`: number of hidden channels in the MLP block in the transformer -- `dropout`: dropout rate -- `emb_dropout`: dropout rate for the positional embedding layer -- `pool`: pooling type, either :class or :mean -- `nclasses`: number of classes in the output + + - `imsize`: image size + - `inchannels`: number of input channels + - `patch_size`: size of the patches + - `embedplanes`: the number of channels after the patch embedding + - `depth`: number of blocks in the transformer + - `nheads`: number of attention heads in the transformer + - `mlpplanes`: number of hidden channels in the MLP block in the transformer + - `dropout`: dropout rate + - `emb_dropout`: dropout rate for the positional embedding layer + - `pool`: pooling type, either :class or :mean + - `nclasses`: number of classes in the output """ function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout = 0.1, emb_dropout = 0.1, pool = :class, nclasses = 1000) - - @assert pool in [:class, :mean] - "Pool type must be either :class (class token) or :mean (mean pooling)" - npatches = prod(imsize .÷ patch_size) - return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - ClassTokens(embedplanes), - ViPosEmbedding(embedplanes, npatches + 1), - Dropout(emb_dropout), - transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), - (pool == :class) ? x -> x[:, 1, :] : seconddimmean), - Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) + @assert pool in [:class, :mean] + "Pool type must be either :class (class token) or :mean (mean pooling)" + npatches = prod(imsize .÷ patch_size) + return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + ClassTokens(embedplanes), + ViPosEmbedding(embedplanes, npatches + 1), + Dropout(emb_dropout), + transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), + (pool == :class) ? 
x -> x[:, 1, :] : seconddimmean), + Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), @@ -62,8 +65,10 @@ vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), :base => (depth = 12, embedplanes = 768, nheads = 12), :large => (depth = 24, embedplanes = 1024, nheads = 16), :huge => (depth = 32, embedplanes = 1280, nheads = 16), - :giant => (depth = 40, embedplanes = 1408, nheads = 16, mlp_ratio = 48/11), - :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, mlp_ratio = 64/13)) + :giant => (depth = 40, embedplanes = 1408, nheads = 16, + mlp_ratio = 48 // 11), + :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, + mlp_ratio = 64 // 13)) """ ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels = 3, @@ -73,26 +78,27 @@ Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). # Arguments -- `mode`: the model configuration, one of [:tiny, :small, :base, :large, :huge, :giant, :gigantic] -- `imsize`: image size -- `inchannels`: number of input channels -- `patch_size`: size of the patches -- `pool`: pooling type, either :class or :mean -- `nclasses`: number of classes in the output + + - `mode`: the model configuration, one of [:tiny, :small, :base, :large, :huge, :giant, :gigantic] + - `imsize`: image size + - `inchannels`: number of input channels + - `patch_size`: size of the patches + - `pool`: pooling type, either :class or :mean + - `nclasses`: number of classes in the output See also [`Metalhead.vit`](#). """ struct ViT - layers + layers::Any end function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), inchannels = 3, patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) - @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" - kwargs = vit_configs[mode] - layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) + @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" + kwargs = vit_configs[mode] + layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) 
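The switch from `48/11` to `48 // 11` in `vit_configs` keeps the giant/gigantic hidden widths exact: the MLP width inside `transformer_encoder` is `floor(Int, mlp_ratio * planes)`. A short sketch of the arithmetic plus a forward pass matching `test/vit-based.jl` (sizes illustrative):

```julia
using Metalhead

floor(Int, 48 // 11 * 1408)   # 6144 hidden planes for :giant   (1408 = 11 * 128)
floor(Int, 64 // 13 * 1664)   # 8192 hidden planes for :gigantic (1664 = 13 * 128)

m = ViT(:small)                               # depth 12, 384 planes, 6 heads
size(m(rand(Float32, 256, 256, 3, 1)))        # (1000, 1) with the default nclasses = 1000
```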
- ViT(layers) + return ViT(layers) end (m::ViT)(x) = m.layers(x) diff --git a/test/convnets.jl b/test/convnets.jl index 3540c3e9f..3a26477cf 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -5,202 +5,194 @@ using Flux PRETRAINED_MODELS = [] @testset "AlexNet" begin - model = AlexNet() - @test size(model(x_256)) == (1000, 1) - @test_throws ArgumentError AlexNet(pretrain = true) - @test gradtest(model, x_256) + model = AlexNet() + @test size(model(x_256)) == (1000, 1) + @test_throws ArgumentError AlexNet(pretrain = true) + @test gradtest(model, x_256) end GC.safepoint() GC.gc() @testset "VGG" begin - @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] - m = VGG(sz, batchnorm = bn) - - @test size(m(x_224)) == (1000, 1) - if (VGG, sz, bn) in PRETRAINED_MODELS - @test (VGG(sz, batchnorm = bn, pretrain = true); true) - else - @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] + m = VGG(sz, batchnorm = bn) + @test size(m(x_224)) == (1000, 1) + if (VGG, sz, bn) in PRETRAINED_MODELS + @test (VGG(sz, batchnorm = bn, pretrain = true); true) + else + @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end end GC.safepoint() GC.gc() @testset "ResNet" begin - @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] - m = ResNet(sz) - - @test size(m(x_256)) == (1000, 1) - if (ResNet, sz) in PRETRAINED_MODELS - @test (ResNet(sz, pretrain = true); true) - else - @test_throws ArgumentError ResNet(sz, pretrain = true) + @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] + m = ResNet(sz) + @test size(m(x_256)) == (1000, 1) + if (ResNet, sz) in PRETRAINED_MODELS + @test (ResNet(sz, pretrain = true); true) + else + @test_throws ArgumentError ResNet(sz, pretrain = true) + end + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() - end - - @testset "Shortcut C" begin - m = Metalhead.resnet(Metalhead.basicblock, :C; - channel_config = [1, 1], - block_config = [2, 2, 2, 2]) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - end + @testset "Shortcut C" begin + m = Metalhead.resnet(Metalhead.basicblock, :C; + channel_config = [1, 1], + block_config = [2, 2, 2, 2]) + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + end end GC.safepoint() GC.gc() @testset "ResNeXt" begin - @testset for depth in [50, 101, 152] - m = ResNeXt(depth) - - @test size(m(x_224)) == (1000, 1) - if ResNeXt in PRETRAINED_MODELS - @test (ResNeXt(depth, pretrain = true); true) - else - @test_throws ArgumentError ResNeXt(depth, pretrain = true) + @testset for depth in [50, 101, 152] + m = ResNeXt(depth) + @test size(m(x_224)) == (1000, 1) + if ResNeXt in PRETRAINED_MODELS + @test (ResNeXt(depth, pretrain = true); true) + else + @test_throws ArgumentError ResNeXt(depth, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end end GC.safepoint() GC.gc() @testset "GoogLeNet" begin - m = GoogLeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (GoogLeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = GoogLeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (GoogLeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset 
"Inception3" begin - m = Inception3() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError Inception3(pretrain = true) - @test gradtest(m, x_224) + m = Inception3() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError Inception3(pretrain = true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset "SqueezeNet" begin - m = SqueezeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (SqueezeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = SqueezeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (SqueezeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset "DenseNet" begin - @testset for sz in [121, 161, 169, 201] - m = DenseNet(sz) - - @test size(m(x_224)) == (1000, 1) - if (DenseNet, sz) in PRETRAINED_MODELS - @test (DenseNet(sz, pretrain = true); true) - else - @test_throws ArgumentError DenseNet(sz, pretrain = true) + @testset for sz in [121, 161, 169, 201] + m = DenseNet(sz) + @test size(m(x_224)) == (1000, 1) + if (DenseNet, sz) in PRETRAINED_MODELS + @test (DenseNet(sz, pretrain = true); true) + else + @test_throws ArgumentError DenseNet(sz, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end end GC.safepoint() GC.gc() @testset "MobileNet" verbose = true begin - @testset "MobileNetv1" begin - m = MobileNetv1() - - @test size(m(x_224)) == (1000, 1) - if MobileNetv1 in PRETRAINED_MODELS - @test (MobileNetv1(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv1(pretrain = true) + @testset "MobileNetv1" begin + m = MobileNetv1() + @test size(m(x_224)) == (1000, 1) + if MobileNetv1 in PRETRAINED_MODELS + @test (MobileNetv1(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv1(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - GC.safepoint() - GC.gc() + GC.safepoint() + GC.gc() + + @testset "MobileNetv2" begin + m = MobileNetv2() + @test size(m(x_224)) == (1000, 1) + if MobileNetv2 in PRETRAINED_MODELS + @test (MobileNetv2(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv2(pretrain = true) + end + @test gradtest(m, x_224) + end - @testset "MobileNetv2" begin - m = MobileNetv2() + GC.safepoint() + GC.gc() - @test size(m(x_224)) == (1000, 1) - if MobileNetv2 in PRETRAINED_MODELS - @test (MobileNetv2(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv2(pretrain = true) + @testset "MobileNetv3" verbose = true begin + @testset for mode in [:small, :large] + m = MobileNetv3(mode) + + @test size(m(x_224)) == (1000, 1) + if MobileNetv3 in PRETRAINED_MODELS + @test (MobileNetv3(mode; pretrain = true); true) + else + @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + end + @test gradtest(m, x_224) + end end - @test gradtest(m, x_224) - end - - GC.safepoint() - GC.gc() - - @testset "MobileNetv3" verbose = true begin - @testset for mode in [:small, :large] - m = MobileNetv3(mode) - - @test size(m(x_224)) == (1000, 1) - if MobileNetv3 in PRETRAINED_MODELS - @test (MobileNetv3(mode; pretrain = true); true) - else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) - end - @test gradtest(m, x_224) end - end -end - -GC.safepoint() -GC.gc() -@testset "ConvNeXt" verbose = true begin - @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] - @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) + GC.safepoint() + GC.gc() - @test 
size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end - end + @testset "ConvNeXt" verbose = true begin + @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] + @testset for drop_path_rate in [0.0, 0.5] + m = ConvNeXt(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end + end end GC.safepoint() GC.gc() @testset "ConvMixer" verbose = true begin - @testset for mode in [:small, :base, :large] - m = ConvMixer(mode) + @testset for mode in [:small, :base, :large] + m = ConvMixer(mode) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end diff --git a/test/other.jl b/test/other.jl index 0162bc4bc..ae964d6d1 100644 --- a/test/other.jl +++ b/test/other.jl @@ -2,37 +2,37 @@ using Metalhead, Test using Flux @testset "MLPMixer" begin - @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end - end + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = MLPMixer(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end + end end @testset "ResMLP" begin @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end + @testset for drop_path_rate in [0.0, 0.5] + m = ResMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end end @testset "gMLP" begin - @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = gMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end - end end diff --git a/test/runtests.jl b/test/runtests.jl index 6dd4a1aa4..610cbf40e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,11 +3,11 @@ using Flux using Flux: Zygote function gradtest(model, input) - y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) - gs = pb(ones(Float32, size(y))) + y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) + gs = pb(ones(Float32, size(y))) - # if we make it to here with no error, success! - return true + # if we make it to here with no error, success! 
+ return true end x_224 = rand(Float32, 224, 224, 3, 1) @@ -15,7 +15,7 @@ x_256 = rand(Float32, 256, 256, 3, 1) # CNN tests @testset verbose = true "ConvNets" begin - include("convnets.jl") + include("convnets.jl") end GC.safepoint() @@ -23,7 +23,7 @@ GC.gc() # Other tests @testset verbose = true "Other" begin - include("other.jl") + include("other.jl") end GC.safepoint() @@ -31,5 +31,5 @@ GC.gc() # ViT tests @testset verbose = true "ViTs" begin - include("vit-based.jl") + include("vit-based.jl") end diff --git a/test/vit-based.jl b/test/vit-based.jl index 20b6ecb86..cdaffc430 100644 --- a/test/vit-based.jl +++ b/test/vit-based.jl @@ -2,11 +2,11 @@ using Metalhead, Test using Flux @testset "ViT" begin - for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] - m = ViT(mode) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() - end + for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] + m = ViT(mode) + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() + end end
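The re-indented test files are still driven from `test/runtests.jl`; a minimal sketch of exercising them from a local checkout of the package:

```julia
using Pkg

Pkg.activate(".")   # from the root of the Metalhead.jl clone
Pkg.test()          # runs convnets.jl, other.jl and vit-based.jl via runtests.jl
```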