From fd2869f57c66fa650547cd8581feeba9eda08b88 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 30 May 2022 06:19:37 +0530 Subject: [PATCH 1/8] Switch to SciML style for code --- .JuliaFormatter.toml | 2 + src/Metalhead.jl | 25 ++-- src/convnets/alexnet.jl | 43 +++--- src/convnets/convmixer.jl | 39 +++--- src/convnets/convnext.jl | 101 +++++++------- src/convnets/densenet.jl | 82 ++++++------ src/convnets/googlenet.jl | 60 ++++----- src/convnets/inception.jl | 191 ++++++++++++--------------- src/convnets/mobilenet.jl | 263 ++++++++++++++++++------------------- src/convnets/resnet.jl | 121 ++++++++--------- src/convnets/resnext.jl | 94 ++++++------- src/convnets/squeezenet.jl | 57 ++++---- src/convnets/vgg.jl | 112 ++++++++-------- src/layers/attention.jl | 52 ++++---- src/layers/conv.jl | 134 ++++++++++--------- src/layers/embeddings.jl | 31 +++-- src/layers/mlp.jl | 22 ++-- src/layers/normalise.jl | 12 +- src/layers/others.jl | 5 +- src/other/mlpmixer.jl | 155 +++++++++++----------- src/pretrain.jl | 20 +-- src/utilities.jl | 18 +-- src/vit-based/vit.jl | 53 ++++---- test/convnets.jl | 200 ++++++++++++++-------------- test/other.jl | 48 +++---- test/runtests.jl | 20 +-- test/vit-based.jl | 6 +- 27 files changed, 972 insertions(+), 994 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 000000000..93a9e7665 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +style = "sciml" +whitespace_in_kwargs = true diff --git a/src/Metalhead.jl b/src/Metalhead.jl index a0fb3785a..e465b6981 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -37,22 +37,23 @@ include("vit-based/vit.jl") include("pretrain.jl") -export AlexNet, - VGG, VGG11, VGG13, VGG16, VGG19, - ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, - GoogLeNet, Inception3, SqueezeNet, - DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, - ResNeXt, - MobileNetv1, MobileNetv2, MobileNetv3, - MLPMixer, ResMLP, gMLP, - ViT, - ConvNeXt, ConvMixer +export AlexNet, + VGG, VGG11, VGG13, VGG16, VGG19, + ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, + GoogLeNet, Inception3, SqueezeNet, + DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, + ResNeXt, + MobileNetv1, MobileNetv2, MobileNetv3, + MLPMixer, ResMLP, gMLP, + ViT, + ConvNeXt, ConvMixer # use Flux._big_show to pretty print large models -for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, :ResNeXt, +for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, + :ResNeXt, :MobileNetv1, :MobileNetv2, :MobileNetv3, :MLPMixer, :ResMLP, :gMLP, :ViT, :ConvNeXt, :ConvMixer) - @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) + @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) end end # module diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index ea3962c2a..93bf1cd67 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -8,23 +8,23 @@ Create an AlexNet model - `nclasses`: the number of output classes """ function alexnet(; nclasses = 1000) - layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((5, 5), 64 => 192, relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((3, 3), 192 => 384, relu, pad = (1, 1)), - Conv((3, 3), 384 => 256, relu, pad = (1, 1)), - Conv((3, 3), 256 => 256, 
relu, pad = (1, 1)), - MaxPool((3, 3), stride = (2, 2)), - AdaptiveMeanPool((6,6))), - Chain(MLUtils.flatten, - Dropout(0.5), - Dense(256 * 6 * 6, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dense(4096, nclasses))) - - return layers + layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Conv((5, 5), 64 => 192, relu, pad = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Conv((3, 3), 192 => 384, relu, pad = (1, 1)), + Conv((3, 3), 384 => 256, relu, pad = (1, 1)), + Conv((3, 3), 256 => 256, relu, pad = (1, 1)), + MaxPool((3, 3), stride = (2, 2)), + AdaptiveMeanPool((6, 6))), + Chain(MLUtils.flatten, + Dropout(0.5), + Dense(256 * 6 * 6, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dense(4096, nclasses))) + + return layers end """ @@ -41,14 +41,13 @@ See also [`alexnet`](#). - `nclasses`: the number of output classes """ struct AlexNet - layers + layers::Any end function AlexNet(; pretrain = false, nclasses = 1000) - layers = alexnet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "AlexNet") - - AlexNet(layers) + layers = alexnet(nclasses = nclasses) + pretrain && loadpretrain!(layers, "AlexNet") + AlexNet(layers) end @functor AlexNet diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index 01a6e61be..2a6aeae05 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -16,20 +16,24 @@ Creates a ConvMixer model. """ function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000) - stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1]) - blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; - preact = true, groups = planes, pad = SamePad())), +), - conv_bn((1, 1), planes, planes, activation; preact = true)...) for _ in 1:depth] - head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) - return Chain(Chain(stem..., Chain(blocks)), head) + stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, + stride = patch_size[1]) + blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; + preact = true, groups = planes, + pad = SamePad())), +), + conv_bn((1, 1), planes, planes, activation; preact = true)...) + for _ in 1:depth] + head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) + return Chain(Chain(stem..., Chain(blocks)), head) end convmixer_config = Dict(:base => Dict(:planes => 1536, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7)), + :patch_size => (7, 7)), :small => Dict(:planes => 768, :depth => 32, :kernel_size => (7, 7), - :patch_size => (7, 7)), - :large => Dict(:planes => 1024, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7))) + :patch_size => (7, 7)), + :large => Dict(:planes => 1024, :depth => 20, + :kernel_size => (9, 9), + :patch_size => (7, 7))) """ ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) @@ -44,16 +48,17 @@ Creates a ConvMixer model. 
- `nclasses`: number of classes in the output """ struct ConvMixer - layers + layers::Any end function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) - planes = convmixer_config[mode][:planes] - depth = convmixer_config[mode][:depth] - kernel_size = convmixer_config[mode][:kernel_size] - patch_size = convmixer_config[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, nclasses) - return ConvMixer(layers) + planes = convmixer_config[mode][:planes] + depth = convmixer_config[mode][:depth] + kernel_size = convmixer_config[mode][:kernel_size] + patch_size = convmixer_config[mode][:patch_size] + layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, + nclasses) + return ConvMixer(layers) end @functor ConvMixer diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 1621803bf..0a44e7482 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -9,15 +9,15 @@ Creates a single block of ConvNeXt. - `drop_path_rate`: Stochastic depth rate. - `λ`: Init value for LayerScale """ -function convnextblock(planes, drop_path_rate = 0., λ = 1f-6) - layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), - swapdims((3, 1, 2, 4)), - LayerNorm(planes; ϵ = 1f-6), - mlp_block(planes, 4 * planes), - LayerScale(planes, λ), - swapdims((2, 3, 1, 4)), - DropPath(drop_path_rate)), +) - return layers +function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6) + layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), + swapdims((3, 1, 2, 4)), + LayerNorm(planes; ϵ = 1.0f-6), + mlp_block(planes, 4 * planes), + LayerScale(planes, λ), + swapdims((2, 3, 1, 4)), + DropPath(drop_path_rate)), +) + return layers end """ @@ -34,45 +34,48 @@ Creates the layers for a ConvNeXt model. 
- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) - `nclasses`: number of output classes """ -function convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) - @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" - - downsample_layers = [] - stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), - ChannelLayerNorm(planes[1]; ϵ = 1f-6)) - push!(downsample_layers, stem) - for m in 1:length(depths) - 1 - downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1f-6), - Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) - push!(downsample_layers, downsample_layer) - end - - stages = [] - dp_rates = LinRange{Float32}(0., drop_path_rate, sum(depths)) - cur = 0 - for i in 1:length(depths) - push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) - cur += depths[i] - end - - backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) - head = Chain(GlobalMeanPool(), - MLUtils.flatten, - LayerNorm(planes[end]), - Dense(planes[end], nclasses)) - - return Chain(Chain(backbone), head) +function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, + nclasses = 1000) + @assert length(depths)==length(planes) "`planes` should have exactly one value for each block" + downsample_layers = [] + stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), + ChannelLayerNorm(planes[1]; ϵ = 1.0f-6)) + push!(downsample_layers, stem) + for m in 1:(length(depths) - 1) + downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1.0f-6), + Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) + push!(downsample_layers, downsample_layer) + end + stages = [] + dp_rates = LinRange{Float32}(0.0, drop_path_rate, sum(depths)) + cur = 0 + for i in 1:length(depths) + push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) + cur += depths[i] + end + backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) + head = Chain(GlobalMeanPool(), + MLUtils.flatten, + LayerNorm(planes[end]), + Dense(planes[end], nclasses)) + + return Chain(Chain(backbone), head) end # Configurations for ConvNeXt models -convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], :planes => [96, 192, 384, 768]), - :small => Dict(:depths => [3, 3, 27, 3], :planes => [96, 192, 384, 768]), - :base => Dict(:depths => [3, 3, 27, 3], :planes => [128, 256, 512, 1024]), - :large => Dict(:depths => [3, 3, 27, 3], :planes => [192, 384, 768, 1536]), - :xlarge => Dict(:depths => [3, 3, 27, 3], :planes => [256, 512, 1024, 2048])) +convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], + :planes => [96, 192, 384, 768]), + :small => Dict(:depths => [3, 3, 27, 3], + :planes => [96, 192, 384, 768]), + :base => Dict(:depths => [3, 3, 27, 3], + :planes => [128, 256, 512, 1024]), + :large => Dict(:depths => [3, 3, 27, 3], + :planes => [192, 384, 768, 1536]), + :xlarge => Dict(:depths => [3, 3, 27, 3], + :planes => [256, 512, 1024, 2048])) struct ConvNeXt - layers + layers::Any end """ @@ -89,13 +92,13 @@ Creates a ConvNeXt model. See also [`Metalhead.convnext`](#). 
""" -function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, +function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, nclasses = 1000) - @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" - depths = convnext_configs[mode][:depths] - planes = convnext_configs[mode][:planes] - layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) - return ConvNeXt(layers) + @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" + depths = convnext_configs[mode][:depths] + planes = convnext_configs[mode][:planes] + layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) + return ConvNeXt(layers) end (m::ConvNeXt)(x) = m.layers(x) diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index bda7a321d..be98509e6 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -10,11 +10,12 @@ Create a Densenet bottleneck layer (and scaling factor for inner feature maps; see ref) """ function dense_bottleneck(inplanes, outplanes) - inner_channels = 4 * outplanes - m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., - conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, rev = true)...) + inner_channels = 4 * outplanes + m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., + conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, + rev = true)...) - SkipConnection(m, cat_channels) + SkipConnection(m, cat_channels) end """ @@ -27,8 +28,10 @@ Create a DenseNet transition sequence - `inplanes`: number of input feature maps - `outplanes`: number of output feature maps """ -transition(inplanes, outplanes) = - Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., MeanPool((2, 2))) +function transition(inplanes, outplanes) + Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., + MeanPool((2, 2))) +end """ dense_block(inplanes, growth_rates) @@ -42,8 +45,10 @@ the number of output feature maps by `growth_rates` with each block - `growth_rates`: the growth (additive) rates of output feature maps after each block (a vector of `k`s from the ref) """ -dense_block(inplanes, growth_rates) = [dense_bottleneck(i, o) - for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +function dense_block(inplanes, growth_rates) + [dense_bottleneck(i, o) + for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +end """ densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) @@ -59,24 +64,24 @@ Create a DenseNet model - `nclasses`: the number of output classes """ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) - push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) - - outplanes = 0 - for (i, rates) in enumerate(growth_rates) - outplanes = inplanes + sum(rates) - append!(layers, dense_block(inplanes, rates)) - (i != length(growth_rates)) && - push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) - inplanes = floor(Int, outplanes * reduction) - end - push!(layers, BatchNorm(outplanes, relu)) - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dense(outplanes, nclasses))) + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), 
bias = false)) + push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) + + outplanes = 0 + for (i, rates) in enumerate(growth_rates) + outplanes = inplanes + sum(rates) + append!(layers, dense_block(inplanes, rates)) + (i != length(growth_rates)) && + push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) + inplanes = floor(Int, outplanes * reduction) + end + push!(layers, BatchNorm(outplanes, relu)) + + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dense(outplanes, nclasses))) end """ @@ -91,9 +96,10 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) = - densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; - reduction = reduction, nclasses = nclasses) +function densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) + densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; + reduction = reduction, nclasses = nclasses) +end """ DenseNet(nblocks::NTuple{N, <:Integer}; @@ -110,16 +116,16 @@ See also [`densenet`](#). - `nclasses`: the number of output classes """ struct DenseNet - layers + layers::Any end function DenseNet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N} - layers = densenet(nblocks; growth_rate = growth_rate, - reduction = reduction, - nclasses = nclasses) + layers = densenet(nblocks; growth_rate = growth_rate, + reduction = reduction, + nclasses = nclasses) - DenseNet(layers) + DenseNet(layers) end @functor DenseNet @@ -148,11 +154,11 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.densenet`](#). """ function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) - @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." - model = DenseNet(densenet_config[config]; nclasses = nclasses) + @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." 
+ model = DenseNet(densenet_config[config]; nclasses = nclasses) - pretrain && loadpretrain!(model, string("DenseNet", config)) - return model + pretrain && loadpretrain!(model, string("DenseNet", config)) + return model end # deprecations diff --git a/src/convnets/googlenet.jl b/src/convnets/googlenet.jl index bc42a052f..40dd5ff41 100644 --- a/src/convnets/googlenet.jl +++ b/src/convnets/googlenet.jl @@ -15,16 +15,12 @@ Create an inception module for use in GoogLeNet """ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj) branch1 = Chain(Conv((1, 1), inplanes => out_1x1)) - branch2 = Chain(Conv((1, 1), inplanes => red_3x3), Conv((3, 3), red_3x3 => out_3x3; pad = 1)) - branch3 = Chain(Conv((1, 1), inplanes => red_5x5), - Conv((5, 5), red_5x5 => out_5x5; pad = 2)) - - branch4 = Chain(MaxPool((3, 3), stride=1, pad = 1), + Conv((5, 5), red_5x5 => out_5x5; pad = 2)) + branch4 = Chain(MaxPool((3, 3), stride = 1, pad = 1), Conv((1, 1), inplanes => pool_proj)) - return Parallel(cat_channels, branch1, branch2, branch3, branch4) end @@ -39,28 +35,27 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) - `nclasses`: the number of output classes """ function googlenet(; nclasses = 1000) - layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), - MaxPool((3, 3), stride = 2, pad = 1), - Conv((1, 1), 64 => 64), - Conv((3, 3), 64 => 192; pad = 1), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(192, 64, 96, 128, 16, 32, 32), - _inceptionblock(256, 128, 128, 192, 32, 96, 64), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(480, 192, 96, 208, 16, 48, 64), - _inceptionblock(512, 160, 112, 224, 24, 64, 64), - _inceptionblock(512, 128, 128, 256, 24, 64, 64), - _inceptionblock(512, 112, 144, 288, 32, 64, 64), - _inceptionblock(528, 256, 160, 320, 32, 128, 128), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(832, 256, 160, 320, 32, 128, 128), - _inceptionblock(832, 384, 192, 384, 48, 128, 128)), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dropout(0.4), - Dense(1024, nclasses))) - - return layers + layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), + MaxPool((3, 3), stride = 2, pad = 1), + Conv((1, 1), 64 => 64), + Conv((3, 3), 64 => 192; pad = 1), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(192, 64, 96, 128, 16, 32, 32), + _inceptionblock(256, 128, 128, 192, 32, 96, 64), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(480, 192, 96, 208, 16, 48, 64), + _inceptionblock(512, 160, 112, 224, 24, 64, 64), + _inceptionblock(512, 128, 128, 256, 24, 64, 64), + _inceptionblock(512, 112, 144, 288, 32, 64, 64), + _inceptionblock(528, 256, 160, 320, 32, 128, 128), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(832, 256, 160, 320, 32, 128, 128), + _inceptionblock(832, 384, 192, 384, 48, 128, 128)), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dropout(0.4), + Dense(1024, nclasses))) + return layers end """ @@ -79,14 +74,13 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`) See also [`googlenet`](#). 
""" struct GoogLeNet - layers + layers::Any end function GoogLeNet(; pretrain = false, nclasses = 1000) - layers = googlenet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "GoogLeNet") - - GoogLeNet(layers) + layers = googlenet(nclasses = nclasses) + pretrain && loadpretrain!(layers, "GoogLeNet") + GoogLeNet(layers) end @functor GoogLeNet diff --git a/src/convnets/inception.jl b/src/convnets/inception.jl index ef8ab81ef..2673d1b8e 100644 --- a/src/convnets/inception.jl +++ b/src/convnets/inception.jl @@ -9,20 +9,16 @@ Create an Inception-v3 style-A module - `pool_proj`: the number of output feature maps for the pooling projection """ function inception_a(inplanes, pool_proj) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) - - branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., - conv_bn((5, 5), 48, 64; pad = 2)...) - - branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; pad = 1)...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, pool_proj)...) - - return Parallel(cat_channels, - branch1x1, branch5x5, branch3x3, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) + branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., + conv_bn((5, 5), 48, 64; pad = 2)...) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; pad = 1)...) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, pool_proj)...) + return Parallel(cat_channels, + branch1x1, branch5x5, branch3x3, branch_pool) end """ @@ -35,16 +31,13 @@ Create an Inception-v3 style-B module - `inplanes`: number of input feature maps """ function inception_b(inplanes) - branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride = 2) - - return Parallel(cat_channels, - branch3x3_1, branch3x3_2, branch_pool) + branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; stride = 2)...) + branch_pool = MaxPool((3, 3), stride = 2) + return Parallel(cat_channels, + branch3x3_1, branch3x3_2, branch_pool) end """ @@ -59,23 +52,19 @@ Create an Inception-v3 style-C module - `n`: the "grid size" (kernel size) for the convolution layers """ function inception_c(inplanes, inner_planes, n = 7) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) - - branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) - - branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride=1), - conv_bn((1, 1), inplanes, 192)...) 
- - return Parallel(cat_channels, - branch1x1, branch7x7_1, branch7x7_2, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) + branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) + branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) + return Parallel(cat_channels, + branch1x1, branch7x7_1, branch7x7_2, branch_pool) end """ @@ -88,18 +77,15 @@ Create an Inception-v3 style-D module - `inplanes`: number of input feature maps """ function inception_d(inplanes) - branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((3, 3), 192, 320; stride = 2)...) - - branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((1, 7), 192, 192; pad = (0, 3))..., - conv_bn((7, 1), 192, 192; pad = (3, 0))..., - conv_bn((3, 3), 192, 192; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride=2) - - return Parallel(cat_channels, - branch3x3, branch7x7x3, branch_pool) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((3, 3), 192, 320; stride = 2)...) + branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((1, 7), 192, 192; pad = (0, 3))..., + conv_bn((7, 1), 192, 192; pad = (3, 0))..., + conv_bn((3, 3), 192, 192; stride = 2)...) + branch_pool = MaxPool((3, 3), stride = 2) + return Parallel(cat_channels, + branch3x3, branch7x7x3, branch_pool) end """ @@ -112,30 +98,25 @@ Create an Inception-v3 style-E module - `inplanes`: number of input feature maps """ function inception_e(inplanes) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) - - branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) - branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., - conv_bn((3, 3), 448, 384; pad = 1)...) - branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, 192)...) - - return Parallel(cat_channels, - branch1x1, - Chain(branch3x3_1, - Parallel(cat_channels, - branch3x3_1a, branch3x3_1b)), - - Chain(branch3x3_2, - Parallel(cat_channels, - branch3x3_2a, branch3x3_2b)), - branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) + branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) + branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., + conv_bn((3, 3), 448, 384; pad = 1)...) + branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) 
+ return Parallel(cat_channels, + branch1x1, + Chain(branch3x3_1, + Parallel(cat_channels, + branch3x3_1a, branch3x3_1b)), + Chain(branch3x3_2, + Parallel(cat_channels, + branch3x3_2a, branch3x3_2b)), + branch_pool) end """ @@ -150,30 +131,29 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). `inception3` does not currently support pretrained weights. """ function inception3(; nclasses = 1000) - layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., - conv_bn((3, 3), 32, 32)..., - conv_bn((3, 3), 32, 64; pad = 1)..., - MaxPool((3, 3), stride = 2), - conv_bn((1, 1), 64, 80)..., - conv_bn((3, 3), 80, 192)..., - MaxPool((3, 3), stride = 2), - inception_a(192, 32), - inception_a(256, 64), - inception_a(288, 64), - inception_b(288), - inception_c(768, 128), - inception_c(768, 160), - inception_c(768, 160), - inception_c(768, 192), - inception_d(768), - inception_e(1280), - inception_e(2048)), - Chain(AdaptiveMeanPool((1, 1)), - Dropout(0.2), - MLUtils.flatten, - Dense(2048, nclasses))) - - return layer + layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., + conv_bn((3, 3), 32, 32)..., + conv_bn((3, 3), 32, 64; pad = 1)..., + MaxPool((3, 3), stride = 2), + conv_bn((1, 1), 64, 80)..., + conv_bn((3, 3), 80, 192)..., + MaxPool((3, 3), stride = 2), + inception_a(192, 32), + inception_a(256, 64), + inception_a(288, 64), + inception_b(288), + inception_c(768, 128), + inception_c(768, 160), + inception_c(768, 160), + inception_c(768, 192), + inception_d(768), + inception_e(1280), + inception_e(2048)), + Chain(AdaptiveMeanPool((1, 1)), + Dropout(0.2), + MLUtils.flatten, + Dense(2048, nclasses))) + return layer end """ @@ -190,14 +170,13 @@ See also [`inception3`](#). `Inception3` does not currently support pretrained weights. """ struct Inception3 - layers + layers::Any end function Inception3(; pretrain = false, nclasses = 1000) - layers = inception3(nclasses = nclasses) - pretrain && loadpretrain!(layers, "Inception3") - - Inception3(layers) + layers = inception3(nclasses = nclasses) + pretrain && loadpretrain!(layers, "Inception3") + Inception3(layers) end @functor Inception3 diff --git a/src/convnets/mobilenet.jl b/src/convnets/mobilenet.jl index 2dfd06f8d..fed893142 100644 --- a/src/convnets/mobilenet.jl +++ b/src/convnets/mobilenet.jl @@ -27,37 +27,37 @@ function mobilenetv1(width_mult, config; inchannels = 3, nclasses = 1000, fcsize = 1024) - layers = [] - for (dw, outch, stride, nrepeats) in config - outch = Int(outch * width_mult) - for _ in 1:nrepeats - layer = dw ? depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; - stride = stride, pad = 1) : - conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) - append!(layers, layer) - inchannels = outch + layers = [] + for (dw, outch, stride, nrepeats) in config + outch = Int(outch * width_mult) + for _ in 1:nrepeats + layer = dw ? 
+ depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; + stride = stride, pad = 1) : + conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) + append!(layers, layer) + inchannels = outch + end end - end - - return Chain(Chain(layers), - Chain(GlobalMeanPool(), - MLUtils.flatten, - Dense(inchannels, fcsize, activation), - Dense(fcsize, nclasses))) + return Chain(Chain(layers), + Chain(GlobalMeanPool(), + MLUtils.flatten, + Dense(inchannels, fcsize, activation), + Dense(fcsize, nclasses))) end const mobilenetv1_configs = [ -# dw, c, s, r - (false, 32, 2, 1), - ( true, 64, 1, 1), - ( true, 128, 2, 1), - ( true, 128, 1, 1), - ( true, 256, 2, 1), - ( true, 256, 1, 1), - ( true, 512, 2, 1), - ( true, 512, 1, 5), - ( true, 1024, 2, 1), - ( true, 1024, 1, 1) + # dw, c, s, r + (false, 32, 2, 1), + (true, 64, 1, 1), + (true, 128, 2, 1), + (true, 128, 1, 1), + (true, 256, 2, 1), + (true, 256, 1, 1), + (true, 512, 2, 1), + (true, 512, 1, 5), + (true, 1024, 2, 1), + (true, 1024, 1, 1), ] """ @@ -77,14 +77,13 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. See also [`Metalhead.mobilenetv1`](#). """ struct MobileNetv1 - layers + layers::Any end function MobileNetv1(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv1")) - - return MobileNetv1(layers) + layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv1")) + return MobileNetv1(layers) end @functor MobileNetv1 @@ -95,7 +94,6 @@ backbone(m::MobileNetv1) = m.layers[1] classifier(m::MobileNetv1) = m.layers[2] # MobileNetv2 - """ mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) @@ -115,44 +113,45 @@ Create a MobileNetv2 model. - `nclasses`: The number of output classes """ function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) - # building first layer - inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) - - # building inverted residual blocks - for (t, c, n, s, a) in configs - outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) - for i in 1:n - push!(layers, invertedresidual(3, inplanes, inplanes * t, outplanes, a; - stride = i == 1 ? s : 1)) - inplanes = outplanes + # building first layer + inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) + # building inverted residual blocks + for (t, c, n, s, a) in configs + outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) + for i in 1:n + push!(layers, + invertedresidual(3, inplanes, inplanes * t, outplanes, a; + stride = i == 1 ? s : 1)) + inplanes = outplanes + end end - end - - # building last several layers - outplanes = (width_mult > 1) ? _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : - max_width - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(outplanes, nclasses))) + # building last several layers + outplanes = (width_mult > 1) ? + _round_channels(max_width * width_mult, width_mult == 0.1 ? 
4 : 8) : + max_width + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(outplanes, nclasses))) end # Layer configurations for MobileNetv2 const mobilenetv2_configs = [ -# t, c, n, s, a - (1, 16, 1, 1, relu6), - (6, 24, 2, 2, relu6), - (6, 32, 3, 2, relu6), - (6, 64, 4, 2, relu6), - (6, 96, 3, 1, relu6), - (6, 160, 3, 2, relu6), - (6, 320, 1, 1, relu6) + # t, c, n, s, a + (1, 16, 1, 1, relu6), + (6, 24, 2, 2, relu6), + (6, 32, 3, 2, relu6), + (6, 64, 4, 2, relu6), + (6, 96, 3, 1, relu6), + (6, 160, 3, 2, relu6), + (6, 320, 1, 1, relu6), ] # Model definition for MobileNetv2 struct MobileNetv2 - layers + layers::Any end """ @@ -172,10 +171,9 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. See also [`Metalhead.mobilenetv2`](#). """ function MobileNetv2(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv2")) - - MobileNetv2(layers) + layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv2")) + MobileNetv2(layers) end @functor MobileNetv2 @@ -186,7 +184,6 @@ backbone(m::MobileNetv2) = m.layers[1] classifier(m::MobileNetv2) = m.layers[2] # MobileNetv3 - """ mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) @@ -208,71 +205,70 @@ Create a MobileNetv3 model. - `nclasses`: the number of output classes """ function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) - # building first layer - inplanes = _round_channels(16 * width_mult, 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) - explanes = 0 - # building inverted residual blocks - for (k, t, c, r, a, s) in configs - # inverted residual layers - outplanes = _round_channels(c * width_mult, 8) - explanes = _round_channels(inplanes * t, 8) - push!(layers, invertedresidual(k, inplanes, explanes, outplanes, a; - stride = s, reduction = r)) - inplanes = outplanes - end - - # building last several layers - output_channel = max_width - output_channel = width_mult > 1.0 ? _round_channels(output_channel * width_mult, 8) : output_channel - classifier = Chain(Dense(explanes, output_channel, hardswish), - Dropout(0.2), - Dense(output_channel, nclasses)) - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) + # building first layer + inplanes = _round_channels(16 * width_mult, 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) + explanes = 0 + # building inverted residual blocks + for (k, t, c, r, a, s) in configs + # inverted residual layers + outplanes = _round_channels(c * width_mult, 8) + explanes = _round_channels(inplanes * t, 8) + push!(layers, + invertedresidual(k, inplanes, explanes, outplanes, a; + stride = s, reduction = r)) + inplanes = outplanes + end + # building last several layers + output_channel = max_width + output_channel = width_mult > 1.0 ? 
_round_channels(output_channel * width_mult, 8) : + output_channel + classifier = Chain(Dense(explanes, output_channel, hardswish), + Dropout(0.2), + Dense(output_channel, nclasses)) + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) end # Configurations for small and large mode for MobileNetv3 -mobilenetv3_configs = Dict( - :small => [ - # k, t, c, SE, a, s - (3, 1, 16, 4, relu, 2), - (3, 4.5, 24, nothing, relu, 2), - (3, 3.67, 24, nothing, relu, 1), - (5, 4, 40, 4, hardswish, 2), - (5, 6, 40, 4, hardswish, 1), - (5, 6, 40, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 2), - (5, 6, 96, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 1), - ], - :large => [ - # k, t, c, SE, a, s - (3, 1, 16, nothing, relu, 1), - (3, 4, 24, nothing, relu, 2), - (3, 3, 24, nothing, relu, 1), - (5, 3, 40, 4, relu, 2), - (5, 3, 40, 4, relu, 1), - (5, 3, 40, 4, relu, 1), - (3, 6, 80, nothing, hardswish, 2), - (3, 2.5, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 2), - (5, 6, 160, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 1) - ] -) +mobilenetv3_configs = Dict(:small => [ + # k, t, c, SE, a, s + (3, 1, 16, 4, relu, 2), + (3, 4.5, 24, nothing, relu, 2), + (3, 3.67, 24, nothing, relu, 1), + (5, 4, 40, 4, hardswish, 2), + (5, 6, 40, 4, hardswish, 1), + (5, 6, 40, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 2), + (5, 6, 96, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 1), + ], + :large => [ + # k, t, c, SE, a, s + (3, 1, 16, nothing, relu, 1), + (3, 4, 24, nothing, relu, 2), + (3, 3, 24, nothing, relu, 1), + (5, 3, 40, 4, relu, 2), + (5, 3, 40, 4, relu, 1), + (5, 3, 40, 4, relu, 1), + (3, 6, 80, nothing, hardswish, 2), + (3, 2.5, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 2), + (5, 6, 160, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 1), + ]) # Model definition for MobileNetv3 struct MobileNetv3 - layers + layers::Any end """ @@ -292,13 +288,14 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.mobilenetv3`](#). """ -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, nclasses = 1000) - @assert mode in [:large, :small] "`mode` has to be either :large or :small" - - max_width = (mode == :large) ? 1280 : 1024 - layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) - MobileNetv3(layers) +function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, + nclasses = 1000) + @assert mode in [:large, :small] "`mode` has to be either :large or :small" + max_width = (mode == :large) ? 
1280 : 1024 + layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, + nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) + MobileNetv3(layers) end @functor MobileNetv3 diff --git a/src/convnets/resnet.jl b/src/convnets/resnet.jl index d91d65d6a..54bb5cb35 100644 --- a/src/convnets/resnet.jl +++ b/src/convnets/resnet.jl @@ -11,9 +11,11 @@ Create a basic residual block - `downsample`: set to `true` to downsample the input """ function basicblock(inplanes, outplanes, downsample = false) - stride = downsample ? 2 : 1 - Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, bias = false)...) + stride = downsample ? 2 : 1 + Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, + bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, + bias = false)...) end """ @@ -36,9 +38,11 @@ The original paper uses `stride == [2, 1, 1]` when `downsample == true` instead. """ function bottleneck(inplanes, outplanes, downsample = false; stride = [1, (downsample ? 2 : 1), 1]) - Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, bias = false)..., - conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], bias = false)...) + Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, + bias = false)..., + conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], + bias = false)...) end """ @@ -55,8 +59,9 @@ layer which has a stride of 2. within the residual block - `downsample`: set to `true` to downsample the input """ -bottleneck_v1(inplanes, outplanes, downsample = false) = +function bottleneck_v1(inplanes, outplanes, downsample = false) bottleneck(inplanes, outplanes, downsample; stride = [(downsample ? 
2 : 1), 1, 1]) +end """ resnet(block, residuals::NTuple{2, Any}, connection = addrelu; @@ -78,31 +83,33 @@ Create a ResNet model """ function resnet(block, residuals::AbstractVector{<:NTuple{2, Any}}, connection = addrelu; channel_config, block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 64 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes .* channel_config - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, block(inplanes, outplanes, i != 1), - residuals[i][1](inplanes, outplanes[end], i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes[end] - for _ in 2:nrepeats - push!(layers, Parallel(connection, block(inplanes, outplanes, false), - residuals[i][2](inplanes, outplanes[end], false))) - inplanes = outplanes[end] + inplanes = 64 + baseplanes = 64 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes .* channel_config + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, block(inplanes, outplanes, i != 1), + residuals[i][1](inplanes, outplanes[end], i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes[end] + for _ in 2:nrepeats + push!(layers, + Parallel(connection, block(inplanes, outplanes, false), + residuals[i][2](inplanes, outplanes[end], false))) + inplanes = outplanes[end] + end + # next set of output plane base is doubled + baseplanes *= 2 end - # next set of output plane base is doubled - baseplanes *= 2 - end - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -126,17 +133,14 @@ Create a ResNet model - `nclasses`: the number of output classes """ function resnet(block, shortcut_config::AbstractVector{<:Symbol}, args...; kwargs...) - shortcut_dict = Dict( - :A => (skip_identity, skip_identity), - :B => (skip_projection, skip_identity), - :C => (skip_projection, skip_projection)) - - if any(sc -> !haskey(shortcut_dict,sc),shortcut_config) - error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") - end - - shortcut = [shortcut_dict[sc] for sc in shortcut_config] - resnet(block, shortcut, args...; kwargs...) + shortcut_dict = Dict(:A => (skip_identity, skip_identity), + :B => (skip_projection, skip_identity), + :C => (skip_projection, skip_projection)) + if any(sc -> !haskey(shortcut_dict, sc), shortcut_config) + error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") + end + shortcut = [shortcut_dict[sc] for sc in shortcut_config] + resnet(block, shortcut, args...; kwargs...) end function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs...) @@ -144,14 +148,15 @@ function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs... 
block_config = block_config, kwargs...) end -resnet(block, residuals::NTuple{2}, args...; kwargs...) = resnet(block, [residuals], args...; kwargs...) +function resnet(block, residuals::NTuple{2}, args...; kwargs...) + resnet(block, [residuals], args...; kwargs...) +end -const resnet_config = - Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), - 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), - 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), - 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), - 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) +const resnet_config = Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), + 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), + 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), + 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), + 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) """ ResNet(channel_config, block_config, shortcut_config; @@ -173,19 +178,18 @@ See also [`resnet`](#). - `nclasses`: the number of output classes """ struct ResNet - layers + layers::Any end function ResNet(channel_config, block_config, shortcut_config; block, connection = addrelu, nclasses = 1000) - layers = resnet(block, - shortcut_config, - connection; - channel_config = channel_config, - block_config = block_config, - nclasses = nclasses) - - ResNet(layers) + layers = resnet(block, + shortcut_config, + connection; + channel_config = channel_config, + block_config = block_config, + nclasses = nclasses) + ResNet(layers) end @functor ResNet @@ -238,7 +242,6 @@ resnet50_v1 = ResNet([1, 1, 4], [3, 4, 6, 3], :B; block = Metalhead.bottleneck_v """ function ResNet(depth::Integer = 50; pretrain = false, nclasses = 1000) @assert depth in keys(resnet_config) "`depth` must be one of $(sort(collect(keys(resnet_config))))" - config, block = resnet_config[depth] model = ResNet(config...; block = block, nclasses = nclasses) pretrain && loadpretrain!(model, string("ResNet", depth)) diff --git a/src/convnets/resnext.jl b/src/convnets/resnext.jl index eaa66f98f..41910cb26 100644 --- a/src/convnets/resnext.jl +++ b/src/convnets/resnext.jl @@ -12,12 +12,12 @@ Create a basic residual block as defined in the paper for ResNeXt - `downsample`: set to `true` to downsample the input """ function resnextblock(inplanes, outplanes, cardinality, width, downsample = false) - stride = downsample ? 2 : 1 - hidden_channels = cardinality * width - return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., - conv_bn((3, 3), hidden_channels, hidden_channels; - stride = stride, pad = 1, bias = false, groups = cardinality)..., - conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) + stride = downsample ? 2 : 1 + hidden_channels = cardinality * width + return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., + conv_bn((3, 3), hidden_channels, hidden_channels; + stride = stride, pad = 1, bias = false, groups = cardinality)..., + conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) end """ @@ -35,33 +35,39 @@ Create a ResNeXt model - `block_config`: a list of the number of residual blocks at each stage - `nclasses`: the number of output classes """ -function resnext(cardinality, width, widen_factor = 2, connection = (x, y) -> @. relu(x) + relu(y); +function resnext(cardinality, width, widen_factor = 2, + connection = (x, y) -> @. 
relu(x) + relu(y); block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 128 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes * widen_factor - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, i != 1), - skip_projection(inplanes, outplanes, i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes - for _ in 2:nrepeats - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, false), - skip_identity(inplanes, outplanes, false))) + inplanes = 64 + baseplanes = 128 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes * widen_factor + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, i != 1), + skip_projection(inplanes, outplanes, i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes + for _ in 2:nrepeats + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, false), + skip_identity(inplanes, outplanes, false))) + end + baseplanes = outplanes + # double width after every cluster of blocks + width *= widen_factor end - baseplanes = outplanes - # double width after every cluster of blocks - width *= widen_factor - end - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -77,12 +83,12 @@ Create a ResNeXt model - `nclasses`: the number of output classes """ struct ResNeXt - layers + layers::Any end function ResNeXt(cardinality, width; block_config, nclasses = 1000) - layers = resnext(cardinality, width; block_config, nclasses) - ResNeXt(layers) + layers = resnext(cardinality, width; block_config, nclasses) + ResNeXt(layers) end @functor ResNeXt @@ -92,11 +98,9 @@ end backbone(m::ResNeXt) = m.layers[1] classifier(m::ResNeXt) = m.layers[2] -const resnext_config = Dict( - 50 => (3, 4, 6, 3), - 101 => (3, 4, 23, 3), - 152 => (3, 8, 36, 3) -) +const resnext_config = Dict(50 => (3, 4, 6, 3), + 101 => (3, 4, 23, 3), + 152 => (3, 8, 36, 3)) """ ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) @@ -110,10 +114,10 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.resnext`](#). 
""" -function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) - @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" - - model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) - pretrain && loadpretrain!(model, string("ResNeXt", config)) - model -end \ No newline at end of file +function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, + nclasses = 1000) + @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" + model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) + pretrain && loadpretrain!(model, string("ResNeXt", config)) + model +end diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index 169ad2e86..209dfb9a2 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -11,14 +11,14 @@ Create a fire module - `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution """ function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) - branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) - branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) - branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) + branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) + branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) + branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) - return Chain(branch_1, - Parallel(cat_channels, - branch_2, - branch_3)) + return Chain(branch_1, + Parallel(cat_channels, + branch_2, + branch_3)) end """ @@ -28,24 +28,24 @@ Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). """ function squeezenet() - layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), - MaxPool((3, 3), stride = 2), - fire(64, 16, 64, 64), - fire(128, 16, 64, 64), - MaxPool((3, 3), stride = 2), - fire(128, 32, 128, 128), - fire(256, 32, 128, 128), - MaxPool((3, 3), stride = 2), - fire(256, 48, 192, 192), - fire(384, 48, 192, 192), - fire(384, 64, 256, 256), - fire(512, 64, 256, 256), - Dropout(0.5), - Conv((1, 1), 512 => 1000, relu)), - AdaptiveMeanPool((1, 1)), - MLUtils.flatten) + layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), + MaxPool((3, 3), stride = 2), + fire(64, 16, 64, 64), + fire(128, 16, 64, 64), + MaxPool((3, 3), stride = 2), + fire(128, 32, 128, 128), + fire(256, 32, 128, 128), + MaxPool((3, 3), stride = 2), + fire(256, 48, 192, 192), + fire(384, 48, 192, 192), + fire(384, 64, 256, 256), + fire(512, 64, 256, 256), + Dropout(0.5), + Conv((1, 1), 512 => 1000, relu)), + AdaptiveMeanPool((1, 1)), + MLUtils.flatten) - return layers + return layers end """ @@ -61,14 +61,13 @@ Set `pretrain=true` to load the model with pre-trained weights for ImageNet. See also [`squeezenet`](#). 
""" struct SqueezeNet - layers + layers::Any end function SqueezeNet(; pretrain = false) - layers = squeezenet() - pretrain && loadpretrain!(layers, "SqueezeNet") - - SqueezeNet(layers) + layers = squeezenet() + pretrain && loadpretrain!(layers, "SqueezeNet") + SqueezeNet(layers) end @functor SqueezeNet diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index bdca0d9ee..2f8777297 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -11,18 +11,18 @@ A VGG block of convolution layers - `batchnorm`: set to `true` to include batch normalization after each convolution """ function vgg_block(ifilters, ofilters, depth, batchnorm) - k = (3,3) - p = (1,1) - layers = [] - for _ in 1:depth - if batchnorm - append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) - else - push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + k = (3, 3) + p = (1, 1) + layers = [] + for _ in 1:depth + if batchnorm + append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) + else + push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + end + ifilters = ofilters end - ifilters = ofilters - end - return layers + return layers end """ @@ -38,14 +38,14 @@ Create VGG convolution layers - `inchannels`: number of input channels """ function vgg_convolutional_layers(config, batchnorm, inchannels) - layers = [] - ifilters = inchannels - for c in config - append!(layers, vgg_block(ifilters, c..., batchnorm)) - push!(layers, MaxPool((2,2), stride=2)) - ifilters, _ = c - end - return layers + layers = [] + ifilters = inchannels + for c in config + append!(layers, vgg_block(ifilters, c..., batchnorm)) + push!(layers, MaxPool((2, 2), stride = 2)) + ifilters, _ = c + end + return layers end """ @@ -62,12 +62,12 @@ Create VGG classifier (fully connected) layers - `dropout`: the dropout level between each fully connected layer """ function vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(MLUtils.flatten, - Dense(Int(prod(imsize)), fcsize, relu), - Dropout(dropout), - Dense(fcsize, fcsize, relu), - Dropout(dropout), - Dense(fcsize, nclasses)) + return Chain(MLUtils.flatten, + Dense(Int(prod(imsize)), fcsize, relu), + Dropout(dropout), + Dense(fcsize, fcsize, relu), + Dropout(dropout), + Dense(fcsize, nclasses)) end """ @@ -88,16 +88,16 @@ Create a VGG model - `dropout`: dropout level between fully connected layers """ function vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - conv = vgg_convolutional_layers(config, batchnorm, inchannels) - imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] - class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(Chain(conv), class) + conv = vgg_convolutional_layers(config, batchnorm, inchannels) + imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] + class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) + return Chain(Chain(conv), class) end -const vgg_conv_config = Dict(:A => [(64,1), (128,1), (256,2), (512,2), (512,2)], - :B => [(64,2), (128,2), (256,2), (512,2), (512,2)], - :D => [(64,2), (128,2), (256,3), (512,3), (512,3)], - :E => [(64,2), (128,2), (256,4), (512,4), (512,4)]) +const vgg_conv_config = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512, 2)], + :B => [(64, 2), (128, 2), (256, 2), (512, 2), (512, 2)], + :D => [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)], + :E => [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]) const vgg_config = Dict(11 => :A, 13 => :B, @@ -105,7 +105,7 @@ const vgg_config = 
Dict(11 => :A, 19 => :E) struct VGG - layers + layers::Any end """ @@ -124,14 +124,14 @@ Construct a VGG model with the specified input image size. Typically, the image """ function VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - layers = vgg(imsize; config = config, - inchannels = inchannels, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = fcsize, - dropout = dropout) - - VGG(layers) + layers = vgg(imsize; config = config, + inchannels = inchannels, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = fcsize, + dropout = dropout) + + VGG(layers) end @functor VGG @@ -155,21 +155,19 @@ See also [`VGG`](#). - `pretrain`: set to `true` to load pre-trained model weights for ImageNet """ function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000) - @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" - - model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], - inchannels = 3, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = 4096, - dropout = 0.5) - - if pretrain && !batchnorm - loadpretrain!(model, string("VGG", depth)) - elseif pretrain - loadpretrain!(model, "VGG$(depth)-BN)") - end - model + @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" + model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], + inchannels = 3, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = 4096, + dropout = 0.5) + if pretrain && !batchnorm + loadpretrain!(model, string("VGG", depth)) + elseif pretrain + loadpretrain!(model, "VGG$(depth)-BN)") + end + model end # deprecations diff --git a/src/layers/attention.jl b/src/layers/attention.jl index 10baf73e9..917b58c88 100644 --- a/src/layers/attention.jl +++ b/src/layers/attention.jl @@ -10,10 +10,10 @@ Multi-head self-attention layer. - `projection`: projection layer to be used after self-attention """ struct MHAttention{P, Q, R} - nheads::Int - qkv_layer::P - attn_drop::Q - projection::R + nheads::Int + qkv_layer::P + attn_drop::Q + projection::R end """ @@ -28,31 +28,31 @@ Multi-head self-attention layer. - `attn_drop`: dropout rate after the self-attention layer - `proj_drop`: dropout rate after the projection layer """ -function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_drop = 0., proj_drop = 0.) 
- @assert planes % nheads == 0 "planes should be divisible by nheads" - qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) - attn_drop = Dropout(attn_drop) - proj = Chain(Dense(planes, planes), Dropout(proj_drop)) - - MHAttention(nheads, qkv_layer, attn_drop, proj) +function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, + attn_drop = 0.0, proj_drop = 0.0) + @assert planes % nheads==0 "planes should be divisible by nheads" + qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) + attn_drop = Dropout(attn_drop) + proj = Chain(Dense(planes, planes), Dropout(proj_drop)) + MHAttention(nheads, qkv_layer, attn_drop, proj) end @functor MHAttention function (m::MHAttention)(x::AbstractArray{T, 3}) where {T} - nfeatures, seq_len, batch_size = size(x) - x_reshaped = reshape(x, nfeatures, seq_len * batch_size) - qkv = m.qkv_layer(x_reshaped) - qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) - query, key, value = chunk(qkv_reshaped, 3; dims = 4) - scale = convert(T, sqrt(size(query, 1) / m.nheads)) - key_reshaped = reshape( - permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, seq_len * batch_size - ) - query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) - value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - pre_projection = reshape(batched_mul(attention, value_reshaped), (nfeatures, seq_len, batch_size)) - y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) - return reshape(y, :, seq_len, batch_size) + nfeatures, seq_len, batch_size = size(x) + x_reshaped = reshape(x, nfeatures, seq_len * batch_size) + qkv = m.qkv_layer(x_reshaped) + qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) + query, key, value = chunk(qkv_reshaped, 3; dims = 4) + scale = convert(T, sqrt(size(query, 1) / m.nheads)) + key_reshaped = reshape(permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, + seq_len * batch_size) + query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) + value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + pre_projection = reshape(batched_mul(attention, value_reshaped), + (nfeatures, seq_len, batch_size)) + y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) + return reshape(y, :, seq_len, batch_size) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ca30df8a4..8455a257e 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -24,28 +24,26 @@ Create a convolution + batch normalization pair with activation. """ function conv_bn(kernelsize, inplanes, outplanes, activation = relu; rev = false, preact = false, - initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1f-5, momentum = 1f-1, + initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1.0f-5, momentum = 1.0f-1, kwargs...) - layers = [] - - if rev - activations = (conv = activation, bn = identity) - bnplanes = inplanes - else - activations = (conv = identity, bn = activation) - bnplanes = outplanes - end - - if preact - rev ? 
throw(ArgumentError("preact and rev cannot be set at the same time")) : - activations = (conv = activation, bn = identity) - end - - push!(layers, Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) - push!(layers, BatchNorm(Int(bnplanes), activations.bn; - initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) - - return rev ? reverse(layers) : layers + layers = [] + if rev + activations = (conv = activation, bn = identity) + bnplanes = inplanes + else + activations = (conv = identity, bn = activation) + bnplanes = outplanes + end + if preact + rev ? throw(ArgumentError("preact and rev cannot be set at the same time")) : + activations = (conv = activation, bn = identity) + end + push!(layers, + Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) + push!(layers, + BatchNorm(Int(bnplanes), activations.bn; + initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) + return rev ? reverse(layers) : layers end """ @@ -77,18 +75,19 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). - `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) - `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) """ -depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; - rev = false, - initβ = Flux.zeros32, initγ = Flux.ones32, - ϵ = 1f-5, momentum = 1f-1, - stride = 1, kwargs...) = - vcat(conv_bn(kernelsize, inplanes, inplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum, - stride = stride, groups = Int(inplanes), kwargs...), - conv_bn((1, 1), inplanes, outplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum)) +function depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; + rev = false, + initβ = Flux.zeros32, initγ = Flux.ones32, + ϵ = 1.0f-5, momentum = 1.0f-1, + stride = 1, kwargs...) + vcat(conv_bn(kernelsize, inplanes, inplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum, + stride = stride, groups = Int(inplanes), kwargs...), + conv_bn((1, 1), inplanes, outplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum)) +end """ skip_projection(inplanes, outplanes, downsample = false) @@ -101,9 +100,11 @@ Create a skip projection - `outplanes`: the number of output feature maps - `downsample`: set to `true` to downsample the input """ -skip_projection(inplanes, outplanes, downsample = false) = downsample ? - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +function skip_projection(inplanes, outplanes, downsample = false) + downsample ? + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +end # array -> PaddedView(0, array, outplanes) for zero padding arrays """ @@ -118,15 +119,16 @@ Create a identity projection - `downsample`: this argument is ignored but it is needed for compatibility with [`resnet`](#). 
""" function skip_identity(inplanes, outplanes) - if outplanes > inplanes - return Chain(MaxPool((1, 1), stride = 2), - y -> cat(y, zeros(eltype(y), - size(y, 1), - size(y, 2), - outplanes - inplanes, size(y, 4)); dims = 3)) - else - return identity - end + if outplanes > inplanes + return Chain(MaxPool((1, 1), stride = 2), + y -> cat(y, + zeros(eltype(y), + size(y, 1), + size(y, 2), + outplanes - inplanes, size(y, 4)); dims = 3)) + else + return identity + end end skip_identity(inplanes, outplanes, downsample) = skip_identity(inplanes, outplanes) @@ -142,10 +144,11 @@ Squeeze and excitation layer used by MobileNet variants (must be >= 1) """ function squeeze_excite(channels, reduction = 4) - @assert (reduction >= 1) "`reduction` must be >= 1" - SkipConnection(Chain(AdaptiveMeanPool((1, 1)), - conv_bn((1, 1), channels, channels ÷ reduction, relu; bias = false)..., - conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) + @assert (reduction>=1) "`reduction` must be >= 1" + SkipConnection(Chain(AdaptiveMeanPool((1, 1)), + conv_bn((1, 1), channels, channels ÷ reduction, relu; + bias = false)..., + conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) end """ @@ -166,21 +169,22 @@ Create a basic inverted residual block for MobileNet variants in a squeeze and excite layer (see [`squeeze_excite`](#)). Must be >= 1 or `nothing` for no squeeze and excite layer. """ -function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation = relu; +function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, + activation = relu; stride, reduction = nothing) - @assert stride in [1, 2] "`stride` has to be 1 or 2" - - pad = @. (kernel_size - 1) ÷ 2 - conv1 = (inplanes == hidden_planes) ? identity : Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) - selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) - - invres = Chain(conv1, - conv_bn(kernel_size, hidden_planes, hidden_planes, activation; - bias = false, stride, pad = pad, groups = hidden_planes)..., - selayer, - conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) - - (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres + @assert stride in [1, 2] "`stride` has to be 1 or 2" + pad = @. (kernel_size - 1) ÷ 2 + conv1 = (inplanes == hidden_planes) ? identity : + Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) + selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) + invres = Chain(conv1, + conv_bn(kernel_size, hidden_planes, hidden_planes, activation; + bias = false, stride, pad = pad, groups = hidden_planes)..., + selayer, + conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) + + (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres +end +function invertedresidual(kernel_size::Integer, args...; kwargs...) + invertedresidual((kernel_size, kernel_size), args...; kwargs...) end -invertedresidual(kernel_size::Integer, args...; kwargs...) = - invertedresidual((kernel_size, kernel_size), args...; kwargs...) diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index fb6bc6e4d..ad4737fb2 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -20,16 +20,13 @@ patches. 
function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) - - im_height, im_width = imsize - patch_height, patch_width = patch_size - - @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) - "Image dimensions must be divisible by the patch size." - - return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), - flatten ? _flatten_spatial : identity, - norm_layer(embedplanes)) + im_height, im_width = imsize + patch_height, patch_width = patch_size + @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) + "Image dimensions must be divisible by the patch size." + return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), + flatten ? _flatten_spatial : identity, + norm_layer(embedplanes)) end """ @@ -38,11 +35,13 @@ end Positional embedding layer used by many vision transformer-like models. """ struct ViPosEmbedding{T} - vectors::T + vectors::T end -ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) = - ViPosEmbedding(init((embedsize, npatches))) +function ViPosEmbedding(embedsize::Integer, npatches::Integer; + init = (dims::Dims{2}) -> rand(Float32, dims)) + ViPosEmbedding(init((embedsize, npatches))) +end (p::ViPosEmbedding)(x) = x .+ p.vectors @@ -54,14 +53,14 @@ ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> Appends class tokens to an input with embedding dimension `dim` for use in many vision transformer models. """ struct ClassTokens{T} - token::T + token::T end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T} - tokens = m.token .* fill(one(T), (1, 1, size(x, 3))) - return hcat(tokens, x) + tokens = m.token .* fill(one(T), (1, 1, size(x, 3))) + return hcat(tokens, x) end @functor ClassTokens diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index ca8f38f97..f14ba8a8c 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -11,10 +11,10 @@ Feedforward block used in many MLPMixer-like and vision-transformer models. - `dropout`: Dropout rate. - `activation`: Activation function to use. """ -function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; - dropout = 0., activation = gelu) - Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), - Dense(hidden_planes, outplanes), Dropout(dropout)) +function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; + dropout = 0.0, activation = gelu) + Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), + Dense(hidden_planes, outplanes), Dropout(dropout)) end """ @@ -33,12 +33,12 @@ Feedforward block based on the implementation in the paper "Pay Attention to MLP - `activation`: Activation function to use. 
""" function gated_mlp_block(gate_layer, inplanes::Integer, hidden_planes::Integer, - outplanes::Integer = inplanes; dropout = 0., activation = gelu) - @assert hidden_planes % 2 == 0 "`hidden_planes` must be even for gated MLP" - return Chain(Dense(inplanes, hidden_planes, activation), - Dropout(dropout), - gate_layer(hidden_planes), - Dense(hidden_planes ÷ 2, outplanes), - Dropout(dropout)) + outplanes::Integer = inplanes; dropout = 0.0, activation = gelu) + @assert hidden_planes % 2==0 "`hidden_planes` must be even for gated MLP" + return Chain(Dense(inplanes, hidden_planes, activation), + Dropout(dropout), + gate_layer(hidden_planes), + Dense(hidden_planes ÷ 2, outplanes), + Dropout(dropout)) end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index a7bce3e6c..42405b563 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -12,16 +12,16 @@ Note that this is specifically for inputs with 4 dimensions in the format (H, W, C, N) where H, W are the height and width of the input, C is the number of channels, and N is the batch size. """ -struct ChannelLayerNorm{D,T} - diag::D - ϵ::T +struct ChannelLayerNorm{D, T} + diag::D + ϵ::T end @functor ChannelLayerNorm (m::ChannelLayerNorm)(x) = m.diag(MLUtils.normalise(x, dims = ndims(x) - 1, ϵ = m.ϵ)) -function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1f-5) - diag = Flux.Scale(1, 1, sz, λ) - return ChannelLayerNorm(diag, ϵ) +function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1.0f-5) + diag = Flux.Scale(1, 1, sz, λ) + return ChannelLayerNorm(diag, ϵ) end diff --git a/src/layers/others.jl b/src/layers/others.jl index 366b273e4..249cacd0e 100644 --- a/src/layers/others.jl +++ b/src/layers/others.jl @@ -8,8 +8,9 @@ Creates a `Flux.Scale` layer that performs "`LayerScale`" - `planes`: Size of channel dimension in the input. - `λ`: initialisation value for the learnable diagonal matrix. """ -LayerScale(planes::Integer, λ) = +function LayerScale(planes::Integer, λ) λ > 0 ? Flux.Scale(fill(Float32(λ), planes), false) : identity +end """ DropPath(p) @@ -20,4 +21,4 @@ Implements Stochastic Depth - equivalent to `Dropout(p; dims = 4)` when `p` ≥ # Arguments - `p`: rate of Stochastic Depth. """ -DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity \ No newline at end of file +DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity diff --git a/src/other/mlpmixer.jl b/src/other/mlpmixer.jl index 880486dc2..a88118060 100644 --- a/src/other/mlpmixer.jl +++ b/src/other/mlpmixer.jl @@ -15,17 +15,17 @@ Creates a feedforward block for the MLPMixer architecture. 
- `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks """ -function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu) - tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] - return Chain(SkipConnection(Chain(LayerNorm(planes), - swapdims((2, 1, 3)), - mlp_layer(npatches, tokenplanes; activation, dropout), - swapdims((2, 1, 3)), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(LayerNorm(planes), - mlp_layer(planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +)) +function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, + dropout = 0.0, drop_path_rate = 0.0, activation = gelu) + tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] + return Chain(SkipConnection(Chain(LayerNorm(planes), + swapdims((2, 1, 3)), + mlp_layer(npatches, tokenplanes; activation, dropout), + swapdims((2, 1, 3)), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(LayerNorm(planes), + mlp_layer(planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +)) end """ @@ -50,27 +50,30 @@ Creates a model with the MLPMixer architecture. - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if not specified. """ -function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, norm_layer = LayerNorm, - patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., +function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, + norm_layer = LayerNorm, + patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, depth = 12, nclasses = 1000, kwargs...) - npatches = prod(imsize .÷ patch_size) - dp_rates = LinRange{Float32}(0., drop_path_rate, depth) - layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], kwargs...) - for i in 1:depth])) - - classification_head = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) - return Chain(layers, classification_head) + npatches = prod(imsize .÷ patch_size) + dp_rates = LinRange{Float32}(0.0, drop_path_rate, depth) + layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], + kwargs...) + for i in 1:depth])) + + classification_head = Chain(norm_layer(embedplanes), seconddimmean, + Dense(embedplanes, nclasses)) + return Chain(layers, classification_head) end # Configurations for MLPMixer models -mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), - :base => Dict(:depth => 12, :planes => 768), +mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), + :base => Dict(:depth => 12, :planes => 768), :large => Dict(:depth => 24, :planes => 1024), - :huge => Dict(:depth => 32, :planes => 1280)) + :huge => Dict(:depth => 32, :planes => 1280)) struct MLPMixer - layers + layers::Any end """ @@ -90,12 +93,13 @@ Creates a model with the MLPMixer architecture. See also [`Metalhead.mlpmixer`](#). 
""" function MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, nclasses) - MLPMixer(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, + nclasses) + MLPMixer(layers) end @functor MLPMixer @@ -124,21 +128,22 @@ Creates a block for the ResMixer architecture. - `λ`: initialisation constant for the LayerScale """ function resmixerblock(planes, npatches; mlp_ratio = 4.0, mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu, λ = 1e-4) -return Chain(SkipConnection(Chain(Flux.Scale(planes), - swapdims((2, 1, 3)), - Dense(npatches, npatches), - swapdims((2, 1, 3)), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(Flux.Scale(planes), - mlp_layer(planes, Int(mlp_ratio * planes); dropout, activation), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +)) + dropout = 0.0, drop_path_rate = 0.0, activation = gelu, λ = 1e-4) + return Chain(SkipConnection(Chain(Flux.Scale(planes), + swapdims((2, 1, 3)), + Dense(npatches, npatches), + swapdims((2, 1, 3)), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(Flux.Scale(planes), + mlp_layer(planes, Int(mlp_ratio * planes); dropout, + activation), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +)) end struct ResMLP - layers + layers::Any end """ @@ -158,13 +163,13 @@ Creates a model with the ResMLP architecture. See also [`Metalhead.mlpmixer`](#). """ function ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - drop_path_rate, depth, nclasses) - ResMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, + drop_path_rate, depth, nclasses) + ResMLP(layers) end @functor ResMLP @@ -185,8 +190,8 @@ Creates a spatial gating unit as described in the gMLP paper. - `proj`: the projection layer to use """ struct SpatialGatingUnit{T, F} - norm::T - proj::F + norm::T + proj::F end """ @@ -201,19 +206,19 @@ Creates a spatial gating unit as described in the gMLP paper. 
- `norm_layer`: the normalisation layer to use """ function SpatialGatingUnit(planes::Integer, npatches::Integer; norm_layer = LayerNorm) - gateplanes = planes ÷ 2 - norm = norm_layer(gateplanes) - proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) - return SpatialGatingUnit(norm, proj) + gateplanes = planes ÷ 2 + norm = norm_layer(gateplanes) + proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) + return SpatialGatingUnit(norm, proj) end @functor SpatialGatingUnit function (m::SpatialGatingUnit)(x) - u, v = chunk(x, 2; dims = 1) - v = m.norm(v) - v = m.proj(permutedims(v, (2, 1, 3))) - return u .* permutedims(v, (2, 1, 3)) + u, v = chunk(x, 2; dims = 1) + v = m.norm(v) + v = m.proj(permutedims(v, (2, 1, 3))) + return u .* permutedims(v, (2, 1, 3)) end """ @@ -235,17 +240,18 @@ Creates a feedforward block based on the gMLP model architecture described in th - `activation`: the activation function to use in the MLP blocks """ function spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = LayerNorm, - mlp_layer = gated_mlp_block, dropout = 0., drop_path_rate = 0., + mlp_layer = gated_mlp_block, dropout = 0.0, + drop_path_rate = 0.0, activation = gelu) - channelplanes = Int(mlp_ratio * planes) - sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) - return SkipConnection(Chain(norm_layer(planes), - mlp_layer(sgu, planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +) + channelplanes = Int(mlp_ratio * planes) + sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) + return SkipConnection(Chain(norm_layer(planes), + mlp_layer(sgu, planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +) end struct gMLP - layers + layers::Any end """ @@ -265,14 +271,13 @@ Creates a model with the gMLP architecture. See also [`Metalhead.mlpmixer`](#). """ function gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, - patch_size, embedplanes, drop_path_rate, depth, nclasses) - - gMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, + patch_size, embedplanes, drop_path_rate, depth, nclasses) + gMLP(layers) end @functor gMLP diff --git a/src/pretrain.jl b/src/pretrain.jl index 97ab7398e..24e6d176d 100644 --- a/src/pretrain.jl +++ b/src/pretrain.jl @@ -4,17 +4,17 @@ Load the pre-trained weights for `model` using the stored artifacts. 
""" function weights(model) - try - path = joinpath(@artifact_str(model), "$model.bson") - artifact = BSON.load(path, @__MODULE__) - if haskey(artifact, :model) - return artifact[:model] - else - throw(ArgumentError("No pre-trained weights available for $model.")) + try + path = joinpath(@artifact_str(model), "$model.bson") + artifact = BSON.load(path, @__MODULE__) + if haskey(artifact, :model) + return artifact[:model] + else + throw(ArgumentError("No pre-trained weights available for $model.")) + end + catch e + throw(ArgumentError("No pre-trained weights available for $model.")) end - catch e - throw(ArgumentError("No pre-trained weights available for $model.")) - end end """ diff --git a/src/utilities.jl b/src/utilities.jl index 39dbdd3b2..6adc1ec87 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -4,9 +4,9 @@ seconddimmean(x) = dropdims(mean(x, dims = 2); dims = 2) # utility function for making sure that all layers have a channel size divisible by 8 # used by MobileNet variants function _round_channels(channels, divisor, min_value = divisor) - new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) - # Make sure that round down does not go down by more than 10% - return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels + new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) + # Make sure that round down does not go down by more than 10% + return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels end """ @@ -47,11 +47,11 @@ swapdims(perm) = Base.Fix2(permutedims, perm) # Utility function for pretty printing large models function _maybe_big_show(io, model) - if isdefined(Flux, :_big_show) - if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL - Flux._big_show(io, model) - else - show(io, model) + if isdefined(Flux, :_big_show) + if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL + Flux._big_show(io, model) + else + show(io, model) + end end - end end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 53932dee1..c9f6082eb 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -11,13 +11,15 @@ Transformer as used in the base ViT architecture. - `mlp_ratio`: ratio of MLP layers to the number of input channels - `dropout`: dropout rate """ -function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) - layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; attn_drop = dropout, - proj_drop = dropout)), +), - SkipConnection(prenorm(planes, mlp_block(planes, floor(Int, mlp_ratio * planes); - dropout)), +)) - for _ in 1:depth] - Chain(layers) +function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.0) + layers = [Chain(SkipConnection(prenorm(planes, + MHAttention(planes, nheads; attn_drop = dropout, + proj_drop = dropout)), +), + SkipConnection(prenorm(planes, + mlp_block(planes, floor(Int, mlp_ratio * planes); + dropout)), +)) + for _ in 1:depth] + Chain(layers) end """ @@ -44,17 +46,16 @@ Creates a Vision Transformer (ViT) model. 
function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout = 0.1, emb_dropout = 0.1, pool = :class, nclasses = 1000) - - @assert pool in [:class, :mean] - "Pool type must be either :class (class token) or :mean (mean pooling)" - npatches = prod(imsize .÷ patch_size) - return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - ClassTokens(embedplanes), - ViPosEmbedding(embedplanes, npatches + 1), - Dropout(emb_dropout), - transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), - (pool == :class) ? x -> selectdim(x, 2, 1) : seconddimmean), - Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) + @assert pool in [:class, :mean] + "Pool type must be either :class (class token) or :mean (mean pooling)" + npatches = prod(imsize .÷ patch_size) + return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + ClassTokens(embedplanes), + ViPosEmbedding(embedplanes, npatches + 1), + Dropout(emb_dropout), + transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), + (pool == :class) ? x -> selectdim(x, 2, 1) : seconddimmean), + Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), @@ -62,8 +63,10 @@ vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), :base => (depth = 12, embedplanes = 768, nheads = 12), :large => (depth = 24, embedplanes = 1024, nheads = 16), :huge => (depth = 32, embedplanes = 1280, nheads = 16), - :giant => (depth = 40, embedplanes = 1408, nheads = 16, mlp_ratio = 48/11), - :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, mlp_ratio = 64/13)) + :giant => (depth = 40, embedplanes = 1408, nheads = 16, + mlp_ratio = 48 / 11), + :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, + mlp_ratio = 64 / 13)) """ ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels = 3, @@ -83,16 +86,16 @@ Creates a Vision Transformer (ViT) model. See also [`Metalhead.vit`](#). """ struct ViT - layers + layers::Any end function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), inchannels = 3, patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) - @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" - kwargs = vit_configs[mode] - layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) + @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" + kwargs = vit_configs[mode] + layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) 
- ViT(layers) + ViT(layers) end (m::ViT)(x) = m.layers(x) diff --git a/test/convnets.jl b/test/convnets.jl index 3540c3e9f..f62ecc3fd 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -5,202 +5,192 @@ using Flux PRETRAINED_MODELS = [] @testset "AlexNet" begin - model = AlexNet() - @test size(model(x_256)) == (1000, 1) - @test_throws ArgumentError AlexNet(pretrain = true) - @test gradtest(model, x_256) + model = AlexNet() + @test size(model(x_256)) == (1000, 1) + @test_throws ArgumentError AlexNet(pretrain = true) + @test gradtest(model, x_256) end GC.safepoint() GC.gc() -@testset "VGG" begin - @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] +@testset "VGG" begin @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], + bn in [true, false] + m = VGG(sz, batchnorm = bn) @test size(m(x_224)) == (1000, 1) if (VGG, sz, bn) in PRETRAINED_MODELS - @test (VGG(sz, batchnorm = bn, pretrain = true); true) + @test (VGG(sz, batchnorm = bn, pretrain = true); true) else - @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() @testset "ResNet" begin - @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] - m = ResNet(sz) - - @test size(m(x_256)) == (1000, 1) - if (ResNet, sz) in PRETRAINED_MODELS - @test (ResNet(sz, pretrain = true); true) - else - @test_throws ArgumentError ResNet(sz, pretrain = true) + @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] + m = ResNet(sz) + + @test size(m(x_256)) == (1000, 1) + if (ResNet, sz) in PRETRAINED_MODELS + @test (ResNet(sz, pretrain = true); true) + else + @test_throws ArgumentError ResNet(sz, pretrain = true) + end + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() - end - @testset "Shortcut C" begin - m = Metalhead.resnet(Metalhead.basicblock, :C; - channel_config = [1, 1], - block_config = [2, 2, 2, 2]) + @testset "Shortcut C" begin + m = Metalhead.resnet(Metalhead.basicblock, :C; + channel_config = [1, 1], + block_config = [2, 2, 2, 2]) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - end + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + end end GC.safepoint() GC.gc() -@testset "ResNeXt" begin - @testset for depth in [50, 101, 152] +@testset "ResNeXt" begin @testset for depth in [50, 101, 152] m = ResNeXt(depth) @test size(m(x_224)) == (1000, 1) if ResNeXt in PRETRAINED_MODELS - @test (ResNeXt(depth, pretrain = true); true) + @test (ResNeXt(depth, pretrain = true); true) else - @test_throws ArgumentError ResNeXt(depth, pretrain = true) + @test_throws ArgumentError ResNeXt(depth, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() @testset "GoogLeNet" begin - m = GoogLeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (GoogLeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = GoogLeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (GoogLeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset "Inception3" begin - m = Inception3() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError Inception3(pretrain = true) - @test gradtest(m, x_224) + m = Inception3() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError Inception3(pretrain = true) + @test gradtest(m, x_224) end 
GC.safepoint() GC.gc() @testset "SqueezeNet" begin - m = SqueezeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (SqueezeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = SqueezeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (SqueezeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() -@testset "DenseNet" begin - @testset for sz in [121, 161, 169, 201] +@testset "DenseNet" begin @testset for sz in [121, 161, 169, 201] m = DenseNet(sz) @test size(m(x_224)) == (1000, 1) if (DenseNet, sz) in PRETRAINED_MODELS - @test (DenseNet(sz, pretrain = true); true) + @test (DenseNet(sz, pretrain = true); true) else - @test_throws ArgumentError DenseNet(sz, pretrain = true) + @test_throws ArgumentError DenseNet(sz, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() -@testset "MobileNet" verbose = true begin - @testset "MobileNetv1" begin - m = MobileNetv1() - - @test size(m(x_224)) == (1000, 1) - if MobileNetv1 in PRETRAINED_MODELS - @test (MobileNetv1(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv1(pretrain = true) +@testset "MobileNet" verbose=true begin + @testset "MobileNetv1" begin + m = MobileNetv1() + + @test size(m(x_224)) == (1000, 1) + if MobileNetv1 in PRETRAINED_MODELS + @test (MobileNetv1(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv1(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - GC.safepoint() - GC.gc() + GC.safepoint() + GC.gc() - @testset "MobileNetv2" begin - m = MobileNetv2() + @testset "MobileNetv2" begin + m = MobileNetv2() - @test size(m(x_224)) == (1000, 1) - if MobileNetv2 in PRETRAINED_MODELS - @test (MobileNetv2(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv2(pretrain = true) + @test size(m(x_224)) == (1000, 1) + if MobileNetv2 in PRETRAINED_MODELS + @test (MobileNetv2(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv2(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - - GC.safepoint() - GC.gc() - - @testset "MobileNetv3" verbose = true begin - @testset for mode in [:small, :large] - m = MobileNetv3(mode) - - @test size(m(x_224)) == (1000, 1) - if MobileNetv3 in PRETRAINED_MODELS - @test (MobileNetv3(mode; pretrain = true); true) - else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) - end - @test gradtest(m, x_224) - end - end + + GC.safepoint() + GC.gc() + + @testset "MobileNetv3" verbose=true begin @testset for mode in [:small, :large] + m = MobileNetv3(mode) + + @test size(m(x_224)) == (1000, 1) + if MobileNetv3 in PRETRAINED_MODELS + @test (MobileNetv3(mode; pretrain = true); true) + else + @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + end + @test gradtest(m, x_224) + end end end GC.safepoint() GC.gc() -@testset "ConvNeXt" verbose = true begin - @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] +@testset "ConvNeXt" verbose=true begin @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) + m = ConvNeXt(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end GC.safepoint() GC.gc() -@testset "ConvMixer" verbose = true begin - @testset for mode in [:small, 
:base, :large] +@testset "ConvMixer" verbose=true begin @testset for mode in [:small, :base, :large] m = ConvMixer(mode) @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end diff --git a/test/other.jl b/test/other.jl index 0162bc4bc..db0bf223c 100644 --- a/test/other.jl +++ b/test/other.jl @@ -1,38 +1,32 @@ using Metalhead, Test using Flux -@testset "MLPMixer" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "MLPMixer" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = MLPMixer(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end -@testset "ResMLP" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "ResMLP" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = ResMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end -@testset "gMLP" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "gMLP" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = gMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end diff --git a/test/runtests.jl b/test/runtests.jl index 6dd4a1aa4..61af837a7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,33 +3,27 @@ using Flux using Flux: Zygote function gradtest(model, input) - y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) - gs = pb(ones(Float32, size(y))) + y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) + gs = pb(ones(Float32, size(y))) - # if we make it to here with no error, success! - return true + # if we make it to here with no error, success! 
+ return true end x_224 = rand(Float32, 224, 224, 3, 1) x_256 = rand(Float32, 256, 256, 3, 1) # CNN tests -@testset verbose = true "ConvNets" begin - include("convnets.jl") -end +@testset verbose=true "ConvNets" begin include("convnets.jl") end GC.safepoint() GC.gc() # Other tests -@testset verbose = true "Other" begin - include("other.jl") -end +@testset verbose=true "Other" begin include("other.jl") end GC.safepoint() GC.gc() # ViT tests -@testset verbose = true "ViTs" begin - include("vit-based.jl") -end +@testset verbose=true "ViTs" begin include("vit-based.jl") end diff --git a/test/vit-based.jl b/test/vit-based.jl index 20b6ecb86..ebd1a0fc2 100644 --- a/test/vit-based.jl +++ b/test/vit-based.jl @@ -1,12 +1,10 @@ using Metalhead, Test using Flux -@testset "ViT" begin - for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] +@testset "ViT" begin for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] m = ViT(mode) @test size(m(x_256)) == (1000, 1) @test gradtest(m, x_256) GC.safepoint() GC.gc() - end -end +end end From c056917daf7fe9056c3ca4845e27b421ccbc8a4d Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 30 May 2022 06:20:20 +0530 Subject: [PATCH 2/8] Create .git-blame-ignore-revs --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..d62e45914 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,3 @@ +# .git-blame-ignore-revs +# Switched to SciML style for code +fd2869f57c66fa650547cd8581feeba9eda08b88 From c0b2f264126fa592832a8ad717608ea1953987a7 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Fri, 10 Jun 2022 19:16:58 +0530 Subject: [PATCH 3/8] Use `@non_differentiable` function for `fill!` in `ClassTokens` Should solve at least part of #165 --- Project.toml | 1 + src/layers/Layers.jl | 1 + src/layers/embeddings.jl | 7 +++++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index c010c513d..8adb95c2b 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.7.1" [deps] Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" diff --git a/src/layers/Layers.jl b/src/layers/Layers.jl index 1034136f3..e9aefd321 100644 --- a/src/layers/Layers.jl +++ b/src/layers/Layers.jl @@ -5,6 +5,7 @@ using Flux: outputsize, Zygote using Functors using Statistics using MLUtils +using ChainRulesCore include("../utilities.jl") diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index fb6bc6e4d..5c8469aa2 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -59,9 +59,12 @@ end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) +_fill_like(y::AbstractArray{T, 3}) where {T} = fill!(similar(y, 1, 1, size(y, 3)), one(T)) +ChainRulesCore.@non_differentiable _fill_like(y) + function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T} - tokens = m.token .* fill(one(T), (1, 1, size(x, 3))) - return hcat(tokens, x) + tokens = m.token .* _fill_like(x) + return hcat(tokens, x) end @functor ClassTokens From 0a864d50aab32760910196de8712e23e6301731d Mon Sep 17 
00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Fri, 10 Jun 2022 21:04:12 +0530 Subject: [PATCH 4/8] Use `MLUtils.ones_like` Also go back to indexing instead of `selectdim` to prevent scalar indexing on the GPU --- Project.toml | 1 - src/layers/Layers.jl | 1 - src/layers/embeddings.jl | 5 +---- src/vit-based/vit.jl | 2 +- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 8adb95c2b..c010c513d 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,6 @@ version = "0.7.1" [deps] Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" diff --git a/src/layers/Layers.jl b/src/layers/Layers.jl index e9aefd321..1034136f3 100644 --- a/src/layers/Layers.jl +++ b/src/layers/Layers.jl @@ -5,7 +5,6 @@ using Flux: outputsize, Zygote using Functors using Statistics using MLUtils -using ChainRulesCore include("../utilities.jl") diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 5c8469aa2..06116bdc2 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -59,11 +59,8 @@ end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) -_fill_like(y::AbstractArray{T, 3}) where {T} = fill!(similar(y, 1, 1, size(y, 3)), one(T)) -ChainRulesCore.@non_differentiable _fill_like(y) - function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T} - tokens = m.token .* _fill_like(x) + tokens = m.token .* MLUtils.ones_like(x, T, (1, 1, size(x, 3))) return hcat(tokens, x) end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 53932dee1..55b3e3d30 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -53,7 +53,7 @@ function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = ViPosEmbedding(embedplanes, npatches + 1), Dropout(emb_dropout), transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), - (pool == :class) ? x -> selectdim(x, 2, 1) : seconddimmean), + (pool == :class) ? 
x -> x[:, 1, :] : seconddimmean), Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end From f4b88ec5a2eef1ca655c97e5fbeeb0c5e6c263d8 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 30 May 2022 06:19:37 +0530 Subject: [PATCH 5/8] Switch to SciML style for code --- .JuliaFormatter.toml | 2 + src/Metalhead.jl | 25 ++-- src/convnets/alexnet.jl | 43 +++--- src/convnets/convmixer.jl | 39 +++--- src/convnets/convnext.jl | 101 +++++++------- src/convnets/densenet.jl | 82 ++++++------ src/convnets/googlenet.jl | 60 ++++----- src/convnets/inception.jl | 191 ++++++++++++--------------- src/convnets/mobilenet.jl | 263 ++++++++++++++++++------------------- src/convnets/resnet.jl | 121 ++++++++--------- src/convnets/resnext.jl | 94 ++++++------- src/convnets/squeezenet.jl | 57 ++++---- src/convnets/vgg.jl | 112 ++++++++-------- src/layers/attention.jl | 52 ++++---- src/layers/conv.jl | 134 ++++++++++--------- src/layers/embeddings.jl | 27 ++-- src/layers/mlp.jl | 22 ++-- src/layers/normalise.jl | 12 +- src/layers/others.jl | 5 +- src/other/mlpmixer.jl | 155 +++++++++++----------- src/pretrain.jl | 20 +-- src/utilities.jl | 18 +-- src/vit-based/vit.jl | 32 +++-- test/convnets.jl | 200 ++++++++++++++-------------- test/other.jl | 48 +++---- test/runtests.jl | 20 +-- test/vit-based.jl | 6 +- 27 files changed, 960 insertions(+), 981 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 000000000..93a9e7665 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +style = "sciml" +whitespace_in_kwargs = true diff --git a/src/Metalhead.jl b/src/Metalhead.jl index a0fb3785a..e465b6981 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -37,22 +37,23 @@ include("vit-based/vit.jl") include("pretrain.jl") -export AlexNet, - VGG, VGG11, VGG13, VGG16, VGG19, - ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, - GoogLeNet, Inception3, SqueezeNet, - DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, - ResNeXt, - MobileNetv1, MobileNetv2, MobileNetv3, - MLPMixer, ResMLP, gMLP, - ViT, - ConvNeXt, ConvMixer +export AlexNet, + VGG, VGG11, VGG13, VGG16, VGG19, + ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, + GoogLeNet, Inception3, SqueezeNet, + DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, + ResNeXt, + MobileNetv1, MobileNetv2, MobileNetv3, + MLPMixer, ResMLP, gMLP, + ViT, + ConvNeXt, ConvMixer # use Flux._big_show to pretty print large models -for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, :ResNeXt, +for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, + :ResNeXt, :MobileNetv1, :MobileNetv2, :MobileNetv3, :MLPMixer, :ResMLP, :gMLP, :ViT, :ConvNeXt, :ConvMixer) - @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) + @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) end end # module diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index ea3962c2a..93bf1cd67 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -8,23 +8,23 @@ Create an AlexNet model - `nclasses`: the number of output classes """ function alexnet(; nclasses = 1000) - layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((5, 5), 64 => 192, relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((3, 3), 
192 => 384, relu, pad = (1, 1)), - Conv((3, 3), 384 => 256, relu, pad = (1, 1)), - Conv((3, 3), 256 => 256, relu, pad = (1, 1)), - MaxPool((3, 3), stride = (2, 2)), - AdaptiveMeanPool((6,6))), - Chain(MLUtils.flatten, - Dropout(0.5), - Dense(256 * 6 * 6, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dense(4096, nclasses))) - - return layers + layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Conv((5, 5), 64 => 192, relu, pad = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Conv((3, 3), 192 => 384, relu, pad = (1, 1)), + Conv((3, 3), 384 => 256, relu, pad = (1, 1)), + Conv((3, 3), 256 => 256, relu, pad = (1, 1)), + MaxPool((3, 3), stride = (2, 2)), + AdaptiveMeanPool((6, 6))), + Chain(MLUtils.flatten, + Dropout(0.5), + Dense(256 * 6 * 6, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dense(4096, nclasses))) + + return layers end """ @@ -41,14 +41,13 @@ See also [`alexnet`](#). - `nclasses`: the number of output classes """ struct AlexNet - layers + layers::Any end function AlexNet(; pretrain = false, nclasses = 1000) - layers = alexnet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "AlexNet") - - AlexNet(layers) + layers = alexnet(nclasses = nclasses) + pretrain && loadpretrain!(layers, "AlexNet") + AlexNet(layers) end @functor AlexNet diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index 01a6e61be..2a6aeae05 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -16,20 +16,24 @@ Creates a ConvMixer model. """ function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000) - stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1]) - blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; - preact = true, groups = planes, pad = SamePad())), +), - conv_bn((1, 1), planes, planes, activation; preact = true)...) for _ in 1:depth] - head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) - return Chain(Chain(stem..., Chain(blocks)), head) + stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, + stride = patch_size[1]) + blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; + preact = true, groups = planes, + pad = SamePad())), +), + conv_bn((1, 1), planes, planes, activation; preact = true)...) + for _ in 1:depth] + head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) + return Chain(Chain(stem..., Chain(blocks)), head) end convmixer_config = Dict(:base => Dict(:planes => 1536, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7)), + :patch_size => (7, 7)), :small => Dict(:planes => 768, :depth => 32, :kernel_size => (7, 7), - :patch_size => (7, 7)), - :large => Dict(:planes => 1024, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7))) + :patch_size => (7, 7)), + :large => Dict(:planes => 1024, :depth => 20, + :kernel_size => (9, 9), + :patch_size => (7, 7))) """ ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) @@ -44,16 +48,17 @@ Creates a ConvMixer model. 
- `nclasses`: number of classes in the output """ struct ConvMixer - layers + layers::Any end function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) - planes = convmixer_config[mode][:planes] - depth = convmixer_config[mode][:depth] - kernel_size = convmixer_config[mode][:kernel_size] - patch_size = convmixer_config[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, nclasses) - return ConvMixer(layers) + planes = convmixer_config[mode][:planes] + depth = convmixer_config[mode][:depth] + kernel_size = convmixer_config[mode][:kernel_size] + patch_size = convmixer_config[mode][:patch_size] + layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, + nclasses) + return ConvMixer(layers) end @functor ConvMixer diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 1621803bf..0a44e7482 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -9,15 +9,15 @@ Creates a single block of ConvNeXt. - `drop_path_rate`: Stochastic depth rate. - `λ`: Init value for LayerScale """ -function convnextblock(planes, drop_path_rate = 0., λ = 1f-6) - layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), - swapdims((3, 1, 2, 4)), - LayerNorm(planes; ϵ = 1f-6), - mlp_block(planes, 4 * planes), - LayerScale(planes, λ), - swapdims((2, 3, 1, 4)), - DropPath(drop_path_rate)), +) - return layers +function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6) + layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), + swapdims((3, 1, 2, 4)), + LayerNorm(planes; ϵ = 1.0f-6), + mlp_block(planes, 4 * planes), + LayerScale(planes, λ), + swapdims((2, 3, 1, 4)), + DropPath(drop_path_rate)), +) + return layers end """ @@ -34,45 +34,48 @@ Creates the layers for a ConvNeXt model. 
- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) - `nclasses`: number of output classes """ -function convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) - @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" - - downsample_layers = [] - stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), - ChannelLayerNorm(planes[1]; ϵ = 1f-6)) - push!(downsample_layers, stem) - for m in 1:length(depths) - 1 - downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1f-6), - Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) - push!(downsample_layers, downsample_layer) - end - - stages = [] - dp_rates = LinRange{Float32}(0., drop_path_rate, sum(depths)) - cur = 0 - for i in 1:length(depths) - push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) - cur += depths[i] - end - - backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) - head = Chain(GlobalMeanPool(), - MLUtils.flatten, - LayerNorm(planes[end]), - Dense(planes[end], nclasses)) - - return Chain(Chain(backbone), head) +function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, + nclasses = 1000) + @assert length(depths)==length(planes) "`planes` should have exactly one value for each block" + downsample_layers = [] + stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), + ChannelLayerNorm(planes[1]; ϵ = 1.0f-6)) + push!(downsample_layers, stem) + for m in 1:(length(depths) - 1) + downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1.0f-6), + Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) + push!(downsample_layers, downsample_layer) + end + stages = [] + dp_rates = LinRange{Float32}(0.0, drop_path_rate, sum(depths)) + cur = 0 + for i in 1:length(depths) + push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) + cur += depths[i] + end + backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) + head = Chain(GlobalMeanPool(), + MLUtils.flatten, + LayerNorm(planes[end]), + Dense(planes[end], nclasses)) + + return Chain(Chain(backbone), head) end # Configurations for ConvNeXt models -convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], :planes => [96, 192, 384, 768]), - :small => Dict(:depths => [3, 3, 27, 3], :planes => [96, 192, 384, 768]), - :base => Dict(:depths => [3, 3, 27, 3], :planes => [128, 256, 512, 1024]), - :large => Dict(:depths => [3, 3, 27, 3], :planes => [192, 384, 768, 1536]), - :xlarge => Dict(:depths => [3, 3, 27, 3], :planes => [256, 512, 1024, 2048])) +convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], + :planes => [96, 192, 384, 768]), + :small => Dict(:depths => [3, 3, 27, 3], + :planes => [96, 192, 384, 768]), + :base => Dict(:depths => [3, 3, 27, 3], + :planes => [128, 256, 512, 1024]), + :large => Dict(:depths => [3, 3, 27, 3], + :planes => [192, 384, 768, 1536]), + :xlarge => Dict(:depths => [3, 3, 27, 3], + :planes => [256, 512, 1024, 2048])) struct ConvNeXt - layers + layers::Any end """ @@ -89,13 +92,13 @@ Creates a ConvNeXt model. See also [`Metalhead.convnext`](#). 
""" -function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, +function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, nclasses = 1000) - @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" - depths = convnext_configs[mode][:depths] - planes = convnext_configs[mode][:planes] - layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) - return ConvNeXt(layers) + @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" + depths = convnext_configs[mode][:depths] + planes = convnext_configs[mode][:planes] + layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) + return ConvNeXt(layers) end (m::ConvNeXt)(x) = m.layers(x) diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index bda7a321d..be98509e6 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -10,11 +10,12 @@ Create a Densenet bottleneck layer (and scaling factor for inner feature maps; see ref) """ function dense_bottleneck(inplanes, outplanes) - inner_channels = 4 * outplanes - m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., - conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, rev = true)...) + inner_channels = 4 * outplanes + m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., + conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, + rev = true)...) - SkipConnection(m, cat_channels) + SkipConnection(m, cat_channels) end """ @@ -27,8 +28,10 @@ Create a DenseNet transition sequence - `inplanes`: number of input feature maps - `outplanes`: number of output feature maps """ -transition(inplanes, outplanes) = - Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., MeanPool((2, 2))) +function transition(inplanes, outplanes) + Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., + MeanPool((2, 2))) +end """ dense_block(inplanes, growth_rates) @@ -42,8 +45,10 @@ the number of output feature maps by `growth_rates` with each block - `growth_rates`: the growth (additive) rates of output feature maps after each block (a vector of `k`s from the ref) """ -dense_block(inplanes, growth_rates) = [dense_bottleneck(i, o) - for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +function dense_block(inplanes, growth_rates) + [dense_bottleneck(i, o) + for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +end """ densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) @@ -59,24 +64,24 @@ Create a DenseNet model - `nclasses`: the number of output classes """ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) - push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) - - outplanes = 0 - for (i, rates) in enumerate(growth_rates) - outplanes = inplanes + sum(rates) - append!(layers, dense_block(inplanes, rates)) - (i != length(growth_rates)) && - push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) - inplanes = floor(Int, outplanes * reduction) - end - push!(layers, BatchNorm(outplanes, relu)) - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dense(outplanes, nclasses))) + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), 
bias = false)) + push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) + + outplanes = 0 + for (i, rates) in enumerate(growth_rates) + outplanes = inplanes + sum(rates) + append!(layers, dense_block(inplanes, rates)) + (i != length(growth_rates)) && + push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) + inplanes = floor(Int, outplanes * reduction) + end + push!(layers, BatchNorm(outplanes, relu)) + + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dense(outplanes, nclasses))) end """ @@ -91,9 +96,10 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) = - densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; - reduction = reduction, nclasses = nclasses) +function densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) + densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; + reduction = reduction, nclasses = nclasses) +end """ DenseNet(nblocks::NTuple{N, <:Integer}; @@ -110,16 +116,16 @@ See also [`densenet`](#). - `nclasses`: the number of output classes """ struct DenseNet - layers + layers::Any end function DenseNet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N} - layers = densenet(nblocks; growth_rate = growth_rate, - reduction = reduction, - nclasses = nclasses) + layers = densenet(nblocks; growth_rate = growth_rate, + reduction = reduction, + nclasses = nclasses) - DenseNet(layers) + DenseNet(layers) end @functor DenseNet @@ -148,11 +154,11 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.densenet`](#). """ function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) - @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." - model = DenseNet(densenet_config[config]; nclasses = nclasses) + @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." 
+ model = DenseNet(densenet_config[config]; nclasses = nclasses) - pretrain && loadpretrain!(model, string("DenseNet", config)) - return model + pretrain && loadpretrain!(model, string("DenseNet", config)) + return model end # deprecations diff --git a/src/convnets/googlenet.jl b/src/convnets/googlenet.jl index bc42a052f..40dd5ff41 100644 --- a/src/convnets/googlenet.jl +++ b/src/convnets/googlenet.jl @@ -15,16 +15,12 @@ Create an inception module for use in GoogLeNet """ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj) branch1 = Chain(Conv((1, 1), inplanes => out_1x1)) - branch2 = Chain(Conv((1, 1), inplanes => red_3x3), Conv((3, 3), red_3x3 => out_3x3; pad = 1)) - branch3 = Chain(Conv((1, 1), inplanes => red_5x5), - Conv((5, 5), red_5x5 => out_5x5; pad = 2)) - - branch4 = Chain(MaxPool((3, 3), stride=1, pad = 1), + Conv((5, 5), red_5x5 => out_5x5; pad = 2)) + branch4 = Chain(MaxPool((3, 3), stride = 1, pad = 1), Conv((1, 1), inplanes => pool_proj)) - return Parallel(cat_channels, branch1, branch2, branch3, branch4) end @@ -39,28 +35,27 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) - `nclasses`: the number of output classes """ function googlenet(; nclasses = 1000) - layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), - MaxPool((3, 3), stride = 2, pad = 1), - Conv((1, 1), 64 => 64), - Conv((3, 3), 64 => 192; pad = 1), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(192, 64, 96, 128, 16, 32, 32), - _inceptionblock(256, 128, 128, 192, 32, 96, 64), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(480, 192, 96, 208, 16, 48, 64), - _inceptionblock(512, 160, 112, 224, 24, 64, 64), - _inceptionblock(512, 128, 128, 256, 24, 64, 64), - _inceptionblock(512, 112, 144, 288, 32, 64, 64), - _inceptionblock(528, 256, 160, 320, 32, 128, 128), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(832, 256, 160, 320, 32, 128, 128), - _inceptionblock(832, 384, 192, 384, 48, 128, 128)), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dropout(0.4), - Dense(1024, nclasses))) - - return layers + layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), + MaxPool((3, 3), stride = 2, pad = 1), + Conv((1, 1), 64 => 64), + Conv((3, 3), 64 => 192; pad = 1), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(192, 64, 96, 128, 16, 32, 32), + _inceptionblock(256, 128, 128, 192, 32, 96, 64), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(480, 192, 96, 208, 16, 48, 64), + _inceptionblock(512, 160, 112, 224, 24, 64, 64), + _inceptionblock(512, 128, 128, 256, 24, 64, 64), + _inceptionblock(512, 112, 144, 288, 32, 64, 64), + _inceptionblock(528, 256, 160, 320, 32, 128, 128), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(832, 256, 160, 320, 32, 128, 128), + _inceptionblock(832, 384, 192, 384, 48, 128, 128)), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dropout(0.4), + Dense(1024, nclasses))) + return layers end """ @@ -79,14 +74,13 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`) See also [`googlenet`](#). 
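A minimal usage sketch (the input size and batch size are illustrative assumptions):

    using Metalhead

    model = GoogLeNet(; nclasses = 1000)
    x = rand(Float32, 224, 224, 3, 1)   # WHCN image batch
    y = model(x)                        # 1000×1 vector of class logits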
""" struct GoogLeNet - layers + layers::Any end function GoogLeNet(; pretrain = false, nclasses = 1000) - layers = googlenet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "GoogLeNet") - - GoogLeNet(layers) + layers = googlenet(nclasses = nclasses) + pretrain && loadpretrain!(layers, "GoogLeNet") + GoogLeNet(layers) end @functor GoogLeNet diff --git a/src/convnets/inception.jl b/src/convnets/inception.jl index ef8ab81ef..2673d1b8e 100644 --- a/src/convnets/inception.jl +++ b/src/convnets/inception.jl @@ -9,20 +9,16 @@ Create an Inception-v3 style-A module - `pool_proj`: the number of output feature maps for the pooling projection """ function inception_a(inplanes, pool_proj) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) - - branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., - conv_bn((5, 5), 48, 64; pad = 2)...) - - branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; pad = 1)...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, pool_proj)...) - - return Parallel(cat_channels, - branch1x1, branch5x5, branch3x3, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) + branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., + conv_bn((5, 5), 48, 64; pad = 2)...) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; pad = 1)...) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, pool_proj)...) + return Parallel(cat_channels, + branch1x1, branch5x5, branch3x3, branch_pool) end """ @@ -35,16 +31,13 @@ Create an Inception-v3 style-B module - `inplanes`: number of input feature maps """ function inception_b(inplanes) - branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride = 2) - - return Parallel(cat_channels, - branch3x3_1, branch3x3_2, branch_pool) + branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; stride = 2)...) + branch_pool = MaxPool((3, 3), stride = 2) + return Parallel(cat_channels, + branch3x3_1, branch3x3_2, branch_pool) end """ @@ -59,23 +52,19 @@ Create an Inception-v3 style-C module - `n`: the "grid size" (kernel size) for the convolution layers """ function inception_c(inplanes, inner_planes, n = 7) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) - - branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) - - branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride=1), - conv_bn((1, 1), inplanes, 192)...) 
- - return Parallel(cat_channels, - branch1x1, branch7x7_1, branch7x7_2, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) + branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) + branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) + return Parallel(cat_channels, + branch1x1, branch7x7_1, branch7x7_2, branch_pool) end """ @@ -88,18 +77,15 @@ Create an Inception-v3 style-D module - `inplanes`: number of input feature maps """ function inception_d(inplanes) - branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((3, 3), 192, 320; stride = 2)...) - - branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((1, 7), 192, 192; pad = (0, 3))..., - conv_bn((7, 1), 192, 192; pad = (3, 0))..., - conv_bn((3, 3), 192, 192; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride=2) - - return Parallel(cat_channels, - branch3x3, branch7x7x3, branch_pool) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((3, 3), 192, 320; stride = 2)...) + branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((1, 7), 192, 192; pad = (0, 3))..., + conv_bn((7, 1), 192, 192; pad = (3, 0))..., + conv_bn((3, 3), 192, 192; stride = 2)...) + branch_pool = MaxPool((3, 3), stride = 2) + return Parallel(cat_channels, + branch3x3, branch7x7x3, branch_pool) end """ @@ -112,30 +98,25 @@ Create an Inception-v3 style-E module - `inplanes`: number of input feature maps """ function inception_e(inplanes) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) - - branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) - branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., - conv_bn((3, 3), 448, 384; pad = 1)...) - branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, 192)...) - - return Parallel(cat_channels, - branch1x1, - Chain(branch3x3_1, - Parallel(cat_channels, - branch3x3_1a, branch3x3_1b)), - - Chain(branch3x3_2, - Parallel(cat_channels, - branch3x3_2a, branch3x3_2b)), - branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) + branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) + branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., + conv_bn((3, 3), 448, 384; pad = 1)...) + branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) 
+ return Parallel(cat_channels, + branch1x1, + Chain(branch3x3_1, + Parallel(cat_channels, + branch3x3_1a, branch3x3_1b)), + Chain(branch3x3_2, + Parallel(cat_channels, + branch3x3_2a, branch3x3_2b)), + branch_pool) end """ @@ -150,30 +131,29 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). `inception3` does not currently support pretrained weights. """ function inception3(; nclasses = 1000) - layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., - conv_bn((3, 3), 32, 32)..., - conv_bn((3, 3), 32, 64; pad = 1)..., - MaxPool((3, 3), stride = 2), - conv_bn((1, 1), 64, 80)..., - conv_bn((3, 3), 80, 192)..., - MaxPool((3, 3), stride = 2), - inception_a(192, 32), - inception_a(256, 64), - inception_a(288, 64), - inception_b(288), - inception_c(768, 128), - inception_c(768, 160), - inception_c(768, 160), - inception_c(768, 192), - inception_d(768), - inception_e(1280), - inception_e(2048)), - Chain(AdaptiveMeanPool((1, 1)), - Dropout(0.2), - MLUtils.flatten, - Dense(2048, nclasses))) - - return layer + layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., + conv_bn((3, 3), 32, 32)..., + conv_bn((3, 3), 32, 64; pad = 1)..., + MaxPool((3, 3), stride = 2), + conv_bn((1, 1), 64, 80)..., + conv_bn((3, 3), 80, 192)..., + MaxPool((3, 3), stride = 2), + inception_a(192, 32), + inception_a(256, 64), + inception_a(288, 64), + inception_b(288), + inception_c(768, 128), + inception_c(768, 160), + inception_c(768, 160), + inception_c(768, 192), + inception_d(768), + inception_e(1280), + inception_e(2048)), + Chain(AdaptiveMeanPool((1, 1)), + Dropout(0.2), + MLUtils.flatten, + Dense(2048, nclasses))) + return layer end """ @@ -190,14 +170,13 @@ See also [`inception3`](#). `Inception3` does not currently support pretrained weights. """ struct Inception3 - layers + layers::Any end function Inception3(; pretrain = false, nclasses = 1000) - layers = inception3(nclasses = nclasses) - pretrain && loadpretrain!(layers, "Inception3") - - Inception3(layers) + layers = inception3(nclasses = nclasses) + pretrain && loadpretrain!(layers, "Inception3") + Inception3(layers) end @functor Inception3 diff --git a/src/convnets/mobilenet.jl b/src/convnets/mobilenet.jl index 2dfd06f8d..fed893142 100644 --- a/src/convnets/mobilenet.jl +++ b/src/convnets/mobilenet.jl @@ -27,37 +27,37 @@ function mobilenetv1(width_mult, config; inchannels = 3, nclasses = 1000, fcsize = 1024) - layers = [] - for (dw, outch, stride, nrepeats) in config - outch = Int(outch * width_mult) - for _ in 1:nrepeats - layer = dw ? depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; - stride = stride, pad = 1) : - conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) - append!(layers, layer) - inchannels = outch + layers = [] + for (dw, outch, stride, nrepeats) in config + outch = Int(outch * width_mult) + for _ in 1:nrepeats + layer = dw ? 
+ depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; + stride = stride, pad = 1) : + conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) + append!(layers, layer) + inchannels = outch + end end - end - - return Chain(Chain(layers), - Chain(GlobalMeanPool(), - MLUtils.flatten, - Dense(inchannels, fcsize, activation), - Dense(fcsize, nclasses))) + return Chain(Chain(layers), + Chain(GlobalMeanPool(), + MLUtils.flatten, + Dense(inchannels, fcsize, activation), + Dense(fcsize, nclasses))) end const mobilenetv1_configs = [ -# dw, c, s, r - (false, 32, 2, 1), - ( true, 64, 1, 1), - ( true, 128, 2, 1), - ( true, 128, 1, 1), - ( true, 256, 2, 1), - ( true, 256, 1, 1), - ( true, 512, 2, 1), - ( true, 512, 1, 5), - ( true, 1024, 2, 1), - ( true, 1024, 1, 1) + # dw, c, s, r + (false, 32, 2, 1), + (true, 64, 1, 1), + (true, 128, 2, 1), + (true, 128, 1, 1), + (true, 256, 2, 1), + (true, 256, 1, 1), + (true, 512, 2, 1), + (true, 512, 1, 5), + (true, 1024, 2, 1), + (true, 1024, 1, 1), ] """ @@ -77,14 +77,13 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. See also [`Metalhead.mobilenetv1`](#). """ struct MobileNetv1 - layers + layers::Any end function MobileNetv1(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv1")) - - return MobileNetv1(layers) + layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv1")) + return MobileNetv1(layers) end @functor MobileNetv1 @@ -95,7 +94,6 @@ backbone(m::MobileNetv1) = m.layers[1] classifier(m::MobileNetv1) = m.layers[2] # MobileNetv2 - """ mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) @@ -115,44 +113,45 @@ Create a MobileNetv2 model. - `nclasses`: The number of output classes """ function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) - # building first layer - inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) - - # building inverted residual blocks - for (t, c, n, s, a) in configs - outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) - for i in 1:n - push!(layers, invertedresidual(3, inplanes, inplanes * t, outplanes, a; - stride = i == 1 ? s : 1)) - inplanes = outplanes + # building first layer + inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) + # building inverted residual blocks + for (t, c, n, s, a) in configs + outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) + for i in 1:n + push!(layers, + invertedresidual(3, inplanes, inplanes * t, outplanes, a; + stride = i == 1 ? s : 1)) + inplanes = outplanes + end end - end - - # building last several layers - outplanes = (width_mult > 1) ? _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : - max_width - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(outplanes, nclasses))) + # building last several layers + outplanes = (width_mult > 1) ? + _round_channels(max_width * width_mult, width_mult == 0.1 ? 
4 : 8) : + max_width + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(outplanes, nclasses))) end # Layer configurations for MobileNetv2 const mobilenetv2_configs = [ -# t, c, n, s, a - (1, 16, 1, 1, relu6), - (6, 24, 2, 2, relu6), - (6, 32, 3, 2, relu6), - (6, 64, 4, 2, relu6), - (6, 96, 3, 1, relu6), - (6, 160, 3, 2, relu6), - (6, 320, 1, 1, relu6) + # t, c, n, s, a + (1, 16, 1, 1, relu6), + (6, 24, 2, 2, relu6), + (6, 32, 3, 2, relu6), + (6, 64, 4, 2, relu6), + (6, 96, 3, 1, relu6), + (6, 160, 3, 2, relu6), + (6, 320, 1, 1, relu6), ] # Model definition for MobileNetv2 struct MobileNetv2 - layers + layers::Any end """ @@ -172,10 +171,9 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. See also [`Metalhead.mobilenetv2`](#). """ function MobileNetv2(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv2")) - - MobileNetv2(layers) + layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv2")) + MobileNetv2(layers) end @functor MobileNetv2 @@ -186,7 +184,6 @@ backbone(m::MobileNetv2) = m.layers[1] classifier(m::MobileNetv2) = m.layers[2] # MobileNetv3 - """ mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) @@ -208,71 +205,70 @@ Create a MobileNetv3 model. - `nclasses`: the number of output classes """ function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) - # building first layer - inplanes = _round_channels(16 * width_mult, 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) - explanes = 0 - # building inverted residual blocks - for (k, t, c, r, a, s) in configs - # inverted residual layers - outplanes = _round_channels(c * width_mult, 8) - explanes = _round_channels(inplanes * t, 8) - push!(layers, invertedresidual(k, inplanes, explanes, outplanes, a; - stride = s, reduction = r)) - inplanes = outplanes - end - - # building last several layers - output_channel = max_width - output_channel = width_mult > 1.0 ? _round_channels(output_channel * width_mult, 8) : output_channel - classifier = Chain(Dense(explanes, output_channel, hardswish), - Dropout(0.2), - Dense(output_channel, nclasses)) - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) + # building first layer + inplanes = _round_channels(16 * width_mult, 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) + explanes = 0 + # building inverted residual blocks + for (k, t, c, r, a, s) in configs + # inverted residual layers + outplanes = _round_channels(c * width_mult, 8) + explanes = _round_channels(inplanes * t, 8) + push!(layers, + invertedresidual(k, inplanes, explanes, outplanes, a; + stride = s, reduction = r)) + inplanes = outplanes + end + # building last several layers + output_channel = max_width + output_channel = width_mult > 1.0 ? 
_round_channels(output_channel * width_mult, 8) : + output_channel + classifier = Chain(Dense(explanes, output_channel, hardswish), + Dropout(0.2), + Dense(output_channel, nclasses)) + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) end # Configurations for small and large mode for MobileNetv3 -mobilenetv3_configs = Dict( - :small => [ - # k, t, c, SE, a, s - (3, 1, 16, 4, relu, 2), - (3, 4.5, 24, nothing, relu, 2), - (3, 3.67, 24, nothing, relu, 1), - (5, 4, 40, 4, hardswish, 2), - (5, 6, 40, 4, hardswish, 1), - (5, 6, 40, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 2), - (5, 6, 96, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 1), - ], - :large => [ - # k, t, c, SE, a, s - (3, 1, 16, nothing, relu, 1), - (3, 4, 24, nothing, relu, 2), - (3, 3, 24, nothing, relu, 1), - (5, 3, 40, 4, relu, 2), - (5, 3, 40, 4, relu, 1), - (5, 3, 40, 4, relu, 1), - (3, 6, 80, nothing, hardswish, 2), - (3, 2.5, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 2), - (5, 6, 160, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 1) - ] -) +mobilenetv3_configs = Dict(:small => [ + # k, t, c, SE, a, s + (3, 1, 16, 4, relu, 2), + (3, 4.5, 24, nothing, relu, 2), + (3, 3.67, 24, nothing, relu, 1), + (5, 4, 40, 4, hardswish, 2), + (5, 6, 40, 4, hardswish, 1), + (5, 6, 40, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 2), + (5, 6, 96, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 1), + ], + :large => [ + # k, t, c, SE, a, s + (3, 1, 16, nothing, relu, 1), + (3, 4, 24, nothing, relu, 2), + (3, 3, 24, nothing, relu, 1), + (5, 3, 40, 4, relu, 2), + (5, 3, 40, 4, relu, 1), + (5, 3, 40, 4, relu, 1), + (3, 6, 80, nothing, hardswish, 2), + (3, 2.5, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 2), + (5, 6, 160, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 1), + ]) # Model definition for MobileNetv3 struct MobileNetv3 - layers + layers::Any end """ @@ -292,13 +288,14 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.mobilenetv3`](#). """ -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, nclasses = 1000) - @assert mode in [:large, :small] "`mode` has to be either :large or :small" - - max_width = (mode == :large) ? 1280 : 1024 - layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) - MobileNetv3(layers) +function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, + nclasses = 1000) + @assert mode in [:large, :small] "`mode` has to be either :large or :small" + max_width = (mode == :large) ? 
1280 : 1024 + layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, + nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) + MobileNetv3(layers) end @functor MobileNetv3 diff --git a/src/convnets/resnet.jl b/src/convnets/resnet.jl index d91d65d6a..54bb5cb35 100644 --- a/src/convnets/resnet.jl +++ b/src/convnets/resnet.jl @@ -11,9 +11,11 @@ Create a basic residual block - `downsample`: set to `true` to downsample the input """ function basicblock(inplanes, outplanes, downsample = false) - stride = downsample ? 2 : 1 - Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, bias = false)...) + stride = downsample ? 2 : 1 + Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, + bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, + bias = false)...) end """ @@ -36,9 +38,11 @@ The original paper uses `stride == [2, 1, 1]` when `downsample == true` instead. """ function bottleneck(inplanes, outplanes, downsample = false; stride = [1, (downsample ? 2 : 1), 1]) - Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, bias = false)..., - conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], bias = false)...) + Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, + bias = false)..., + conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], + bias = false)...) end """ @@ -55,8 +59,9 @@ layer which has a stride of 2. within the residual block - `downsample`: set to `true` to downsample the input """ -bottleneck_v1(inplanes, outplanes, downsample = false) = +function bottleneck_v1(inplanes, outplanes, downsample = false) bottleneck(inplanes, outplanes, downsample; stride = [(downsample ? 
2 : 1), 1, 1]) +end """ resnet(block, residuals::NTuple{2, Any}, connection = addrelu; @@ -78,31 +83,33 @@ Create a ResNet model """ function resnet(block, residuals::AbstractVector{<:NTuple{2, Any}}, connection = addrelu; channel_config, block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 64 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes .* channel_config - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, block(inplanes, outplanes, i != 1), - residuals[i][1](inplanes, outplanes[end], i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes[end] - for _ in 2:nrepeats - push!(layers, Parallel(connection, block(inplanes, outplanes, false), - residuals[i][2](inplanes, outplanes[end], false))) - inplanes = outplanes[end] + inplanes = 64 + baseplanes = 64 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes .* channel_config + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, block(inplanes, outplanes, i != 1), + residuals[i][1](inplanes, outplanes[end], i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes[end] + for _ in 2:nrepeats + push!(layers, + Parallel(connection, block(inplanes, outplanes, false), + residuals[i][2](inplanes, outplanes[end], false))) + inplanes = outplanes[end] + end + # next set of output plane base is doubled + baseplanes *= 2 end - # next set of output plane base is doubled - baseplanes *= 2 - end - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -126,17 +133,14 @@ Create a ResNet model - `nclasses`: the number of output classes """ function resnet(block, shortcut_config::AbstractVector{<:Symbol}, args...; kwargs...) - shortcut_dict = Dict( - :A => (skip_identity, skip_identity), - :B => (skip_projection, skip_identity), - :C => (skip_projection, skip_projection)) - - if any(sc -> !haskey(shortcut_dict,sc),shortcut_config) - error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") - end - - shortcut = [shortcut_dict[sc] for sc in shortcut_config] - resnet(block, shortcut, args...; kwargs...) + shortcut_dict = Dict(:A => (skip_identity, skip_identity), + :B => (skip_projection, skip_identity), + :C => (skip_projection, skip_projection)) + if any(sc -> !haskey(shortcut_dict, sc), shortcut_config) + error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") + end + shortcut = [shortcut_dict[sc] for sc in shortcut_config] + resnet(block, shortcut, args...; kwargs...) end function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs...) @@ -144,14 +148,15 @@ function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs... 
block_config = block_config, kwargs...) end -resnet(block, residuals::NTuple{2}, args...; kwargs...) = resnet(block, [residuals], args...; kwargs...) +function resnet(block, residuals::NTuple{2}, args...; kwargs...) + resnet(block, [residuals], args...; kwargs...) +end -const resnet_config = - Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), - 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), - 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), - 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), - 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) +const resnet_config = Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), + 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), + 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), + 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), + 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) """ ResNet(channel_config, block_config, shortcut_config; @@ -173,19 +178,18 @@ See also [`resnet`](#). - `nclasses`: the number of output classes """ struct ResNet - layers + layers::Any end function ResNet(channel_config, block_config, shortcut_config; block, connection = addrelu, nclasses = 1000) - layers = resnet(block, - shortcut_config, - connection; - channel_config = channel_config, - block_config = block_config, - nclasses = nclasses) - - ResNet(layers) + layers = resnet(block, + shortcut_config, + connection; + channel_config = channel_config, + block_config = block_config, + nclasses = nclasses) + ResNet(layers) end @functor ResNet @@ -238,7 +242,6 @@ resnet50_v1 = ResNet([1, 1, 4], [3, 4, 6, 3], :B; block = Metalhead.bottleneck_v """ function ResNet(depth::Integer = 50; pretrain = false, nclasses = 1000) @assert depth in keys(resnet_config) "`depth` must be one of $(sort(collect(keys(resnet_config))))" - config, block = resnet_config[depth] model = ResNet(config...; block = block, nclasses = nclasses) pretrain && loadpretrain!(model, string("ResNet", depth)) diff --git a/src/convnets/resnext.jl b/src/convnets/resnext.jl index eaa66f98f..41910cb26 100644 --- a/src/convnets/resnext.jl +++ b/src/convnets/resnext.jl @@ -12,12 +12,12 @@ Create a basic residual block as defined in the paper for ResNeXt - `downsample`: set to `true` to downsample the input """ function resnextblock(inplanes, outplanes, cardinality, width, downsample = false) - stride = downsample ? 2 : 1 - hidden_channels = cardinality * width - return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., - conv_bn((3, 3), hidden_channels, hidden_channels; - stride = stride, pad = 1, bias = false, groups = cardinality)..., - conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) + stride = downsample ? 2 : 1 + hidden_channels = cardinality * width + return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., + conv_bn((3, 3), hidden_channels, hidden_channels; + stride = stride, pad = 1, bias = false, groups = cardinality)..., + conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) end """ @@ -35,33 +35,39 @@ Create a ResNeXt model - `block_config`: a list of the number of residual blocks at each stage - `nclasses`: the number of output classes """ -function resnext(cardinality, width, widen_factor = 2, connection = (x, y) -> @. relu(x) + relu(y); +function resnext(cardinality, width, widen_factor = 2, + connection = (x, y) -> @. 
relu(x) + relu(y); block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 128 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes * widen_factor - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, i != 1), - skip_projection(inplanes, outplanes, i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes - for _ in 2:nrepeats - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, false), - skip_identity(inplanes, outplanes, false))) + inplanes = 64 + baseplanes = 128 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes * widen_factor + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, i != 1), + skip_projection(inplanes, outplanes, i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes + for _ in 2:nrepeats + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, false), + skip_identity(inplanes, outplanes, false))) + end + baseplanes = outplanes + # double width after every cluster of blocks + width *= widen_factor end - baseplanes = outplanes - # double width after every cluster of blocks - width *= widen_factor - end - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -77,12 +83,12 @@ Create a ResNeXt model - `nclasses`: the number of output classes """ struct ResNeXt - layers + layers::Any end function ResNeXt(cardinality, width; block_config, nclasses = 1000) - layers = resnext(cardinality, width; block_config, nclasses) - ResNeXt(layers) + layers = resnext(cardinality, width; block_config, nclasses) + ResNeXt(layers) end @functor ResNeXt @@ -92,11 +98,9 @@ end backbone(m::ResNeXt) = m.layers[1] classifier(m::ResNeXt) = m.layers[2] -const resnext_config = Dict( - 50 => (3, 4, 6, 3), - 101 => (3, 4, 23, 3), - 152 => (3, 8, 36, 3) -) +const resnext_config = Dict(50 => (3, 4, 6, 3), + 101 => (3, 4, 23, 3), + 152 => (3, 8, 36, 3)) """ ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) @@ -110,10 +114,10 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.resnext`](#). 
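A minimal usage sketch (the keyword values shown are the defaults; the input size is an illustrative assumption):

    using Metalhead

    model = ResNeXt(50; cardinality = 32, width = 4, nclasses = 1000)   # config is one of 50, 101, 152
    x = rand(Float32, 224, 224, 3, 1)   # WHCN image batch
    y = model(x)                        # 1000×1 vector of class logits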
""" -function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) - @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" - - model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) - pretrain && loadpretrain!(model, string("ResNeXt", config)) - model -end \ No newline at end of file +function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, + nclasses = 1000) + @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" + model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) + pretrain && loadpretrain!(model, string("ResNeXt", config)) + model +end diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index 169ad2e86..209dfb9a2 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -11,14 +11,14 @@ Create a fire module - `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution """ function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) - branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) - branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) - branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) + branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) + branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) + branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) - return Chain(branch_1, - Parallel(cat_channels, - branch_2, - branch_3)) + return Chain(branch_1, + Parallel(cat_channels, + branch_2, + branch_3)) end """ @@ -28,24 +28,24 @@ Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). """ function squeezenet() - layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), - MaxPool((3, 3), stride = 2), - fire(64, 16, 64, 64), - fire(128, 16, 64, 64), - MaxPool((3, 3), stride = 2), - fire(128, 32, 128, 128), - fire(256, 32, 128, 128), - MaxPool((3, 3), stride = 2), - fire(256, 48, 192, 192), - fire(384, 48, 192, 192), - fire(384, 64, 256, 256), - fire(512, 64, 256, 256), - Dropout(0.5), - Conv((1, 1), 512 => 1000, relu)), - AdaptiveMeanPool((1, 1)), - MLUtils.flatten) + layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), + MaxPool((3, 3), stride = 2), + fire(64, 16, 64, 64), + fire(128, 16, 64, 64), + MaxPool((3, 3), stride = 2), + fire(128, 32, 128, 128), + fire(256, 32, 128, 128), + MaxPool((3, 3), stride = 2), + fire(256, 48, 192, 192), + fire(384, 48, 192, 192), + fire(384, 64, 256, 256), + fire(512, 64, 256, 256), + Dropout(0.5), + Conv((1, 1), 512 => 1000, relu)), + AdaptiveMeanPool((1, 1)), + MLUtils.flatten) - return layers + return layers end """ @@ -61,14 +61,13 @@ Set `pretrain=true` to load the model with pre-trained weights for ImageNet. See also [`squeezenet`](#). 
""" struct SqueezeNet - layers + layers::Any end function SqueezeNet(; pretrain = false) - layers = squeezenet() - pretrain && loadpretrain!(layers, "SqueezeNet") - - SqueezeNet(layers) + layers = squeezenet() + pretrain && loadpretrain!(layers, "SqueezeNet") + SqueezeNet(layers) end @functor SqueezeNet diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index bdca0d9ee..2f8777297 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -11,18 +11,18 @@ A VGG block of convolution layers - `batchnorm`: set to `true` to include batch normalization after each convolution """ function vgg_block(ifilters, ofilters, depth, batchnorm) - k = (3,3) - p = (1,1) - layers = [] - for _ in 1:depth - if batchnorm - append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) - else - push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + k = (3, 3) + p = (1, 1) + layers = [] + for _ in 1:depth + if batchnorm + append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) + else + push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + end + ifilters = ofilters end - ifilters = ofilters - end - return layers + return layers end """ @@ -38,14 +38,14 @@ Create VGG convolution layers - `inchannels`: number of input channels """ function vgg_convolutional_layers(config, batchnorm, inchannels) - layers = [] - ifilters = inchannels - for c in config - append!(layers, vgg_block(ifilters, c..., batchnorm)) - push!(layers, MaxPool((2,2), stride=2)) - ifilters, _ = c - end - return layers + layers = [] + ifilters = inchannels + for c in config + append!(layers, vgg_block(ifilters, c..., batchnorm)) + push!(layers, MaxPool((2, 2), stride = 2)) + ifilters, _ = c + end + return layers end """ @@ -62,12 +62,12 @@ Create VGG classifier (fully connected) layers - `dropout`: the dropout level between each fully connected layer """ function vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(MLUtils.flatten, - Dense(Int(prod(imsize)), fcsize, relu), - Dropout(dropout), - Dense(fcsize, fcsize, relu), - Dropout(dropout), - Dense(fcsize, nclasses)) + return Chain(MLUtils.flatten, + Dense(Int(prod(imsize)), fcsize, relu), + Dropout(dropout), + Dense(fcsize, fcsize, relu), + Dropout(dropout), + Dense(fcsize, nclasses)) end """ @@ -88,16 +88,16 @@ Create a VGG model - `dropout`: dropout level between fully connected layers """ function vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - conv = vgg_convolutional_layers(config, batchnorm, inchannels) - imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] - class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(Chain(conv), class) + conv = vgg_convolutional_layers(config, batchnorm, inchannels) + imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] + class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) + return Chain(Chain(conv), class) end -const vgg_conv_config = Dict(:A => [(64,1), (128,1), (256,2), (512,2), (512,2)], - :B => [(64,2), (128,2), (256,2), (512,2), (512,2)], - :D => [(64,2), (128,2), (256,3), (512,3), (512,3)], - :E => [(64,2), (128,2), (256,4), (512,4), (512,4)]) +const vgg_conv_config = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512, 2)], + :B => [(64, 2), (128, 2), (256, 2), (512, 2), (512, 2)], + :D => [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)], + :E => [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]) const vgg_config = Dict(11 => :A, 13 => :B, @@ -105,7 +105,7 @@ const vgg_config = 
Dict(11 => :A, 19 => :E) struct VGG - layers + layers::Any end """ @@ -124,14 +124,14 @@ Construct a VGG model with the specified input image size. Typically, the image """ function VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - layers = vgg(imsize; config = config, - inchannels = inchannels, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = fcsize, - dropout = dropout) - - VGG(layers) + layers = vgg(imsize; config = config, + inchannels = inchannels, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = fcsize, + dropout = dropout) + + VGG(layers) end @functor VGG @@ -155,21 +155,19 @@ See also [`VGG`](#). - `pretrain`: set to `true` to load pre-trained model weights for ImageNet """ function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000) - @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" - - model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], - inchannels = 3, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = 4096, - dropout = 0.5) - - if pretrain && !batchnorm - loadpretrain!(model, string("VGG", depth)) - elseif pretrain - loadpretrain!(model, "VGG$(depth)-BN)") - end - model + @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" + model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], + inchannels = 3, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = 4096, + dropout = 0.5) + if pretrain && !batchnorm + loadpretrain!(model, string("VGG", depth)) + elseif pretrain + loadpretrain!(model, "VGG$(depth)-BN)") + end + model end # deprecations diff --git a/src/layers/attention.jl b/src/layers/attention.jl index 10baf73e9..917b58c88 100644 --- a/src/layers/attention.jl +++ b/src/layers/attention.jl @@ -10,10 +10,10 @@ Multi-head self-attention layer. - `projection`: projection layer to be used after self-attention """ struct MHAttention{P, Q, R} - nheads::Int - qkv_layer::P - attn_drop::Q - projection::R + nheads::Int + qkv_layer::P + attn_drop::Q + projection::R end """ @@ -28,31 +28,31 @@ Multi-head self-attention layer. - `attn_drop`: dropout rate after the self-attention layer - `proj_drop`: dropout rate after the projection layer """ -function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_drop = 0., proj_drop = 0.) 
- @assert planes % nheads == 0 "planes should be divisible by nheads" - qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) - attn_drop = Dropout(attn_drop) - proj = Chain(Dense(planes, planes), Dropout(proj_drop)) - - MHAttention(nheads, qkv_layer, attn_drop, proj) +function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, + attn_drop = 0.0, proj_drop = 0.0) + @assert planes % nheads==0 "planes should be divisible by nheads" + qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) + attn_drop = Dropout(attn_drop) + proj = Chain(Dense(planes, planes), Dropout(proj_drop)) + MHAttention(nheads, qkv_layer, attn_drop, proj) end @functor MHAttention function (m::MHAttention)(x::AbstractArray{T, 3}) where {T} - nfeatures, seq_len, batch_size = size(x) - x_reshaped = reshape(x, nfeatures, seq_len * batch_size) - qkv = m.qkv_layer(x_reshaped) - qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) - query, key, value = chunk(qkv_reshaped, 3; dims = 4) - scale = convert(T, sqrt(size(query, 1) / m.nheads)) - key_reshaped = reshape( - permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, seq_len * batch_size - ) - query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) - value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - pre_projection = reshape(batched_mul(attention, value_reshaped), (nfeatures, seq_len, batch_size)) - y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) - return reshape(y, :, seq_len, batch_size) + nfeatures, seq_len, batch_size = size(x) + x_reshaped = reshape(x, nfeatures, seq_len * batch_size) + qkv = m.qkv_layer(x_reshaped) + qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) + query, key, value = chunk(qkv_reshaped, 3; dims = 4) + scale = convert(T, sqrt(size(query, 1) / m.nheads)) + key_reshaped = reshape(permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, + seq_len * batch_size) + query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) + value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + pre_projection = reshape(batched_mul(attention, value_reshaped), + (nfeatures, seq_len, batch_size)) + y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) + return reshape(y, :, seq_len, batch_size) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ca30df8a4..8455a257e 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -24,28 +24,26 @@ Create a convolution + batch normalization pair with activation. """ function conv_bn(kernelsize, inplanes, outplanes, activation = relu; rev = false, preact = false, - initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1f-5, momentum = 1f-1, + initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1.0f-5, momentum = 1.0f-1, kwargs...) - layers = [] - - if rev - activations = (conv = activation, bn = identity) - bnplanes = inplanes - else - activations = (conv = identity, bn = activation) - bnplanes = outplanes - end - - if preact - rev ? 
throw(ArgumentError("preact and rev cannot be set at the same time")) : - activations = (conv = activation, bn = identity) - end - - push!(layers, Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) - push!(layers, BatchNorm(Int(bnplanes), activations.bn; - initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) - - return rev ? reverse(layers) : layers + layers = [] + if rev + activations = (conv = activation, bn = identity) + bnplanes = inplanes + else + activations = (conv = identity, bn = activation) + bnplanes = outplanes + end + if preact + rev ? throw(ArgumentError("preact and rev cannot be set at the same time")) : + activations = (conv = activation, bn = identity) + end + push!(layers, + Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) + push!(layers, + BatchNorm(Int(bnplanes), activations.bn; + initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) + return rev ? reverse(layers) : layers end """ @@ -77,18 +75,19 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). - `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) - `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) """ -depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; - rev = false, - initβ = Flux.zeros32, initγ = Flux.ones32, - ϵ = 1f-5, momentum = 1f-1, - stride = 1, kwargs...) = - vcat(conv_bn(kernelsize, inplanes, inplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum, - stride = stride, groups = Int(inplanes), kwargs...), - conv_bn((1, 1), inplanes, outplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum)) +function depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; + rev = false, + initβ = Flux.zeros32, initγ = Flux.ones32, + ϵ = 1.0f-5, momentum = 1.0f-1, + stride = 1, kwargs...) + vcat(conv_bn(kernelsize, inplanes, inplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum, + stride = stride, groups = Int(inplanes), kwargs...), + conv_bn((1, 1), inplanes, outplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum)) +end """ skip_projection(inplanes, outplanes, downsample = false) @@ -101,9 +100,11 @@ Create a skip projection - `outplanes`: the number of output feature maps - `downsample`: set to `true` to downsample the input """ -skip_projection(inplanes, outplanes, downsample = false) = downsample ? - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +function skip_projection(inplanes, outplanes, downsample = false) + downsample ? + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +end # array -> PaddedView(0, array, outplanes) for zero padding arrays """ @@ -118,15 +119,16 @@ Create a identity projection - `downsample`: this argument is ignored but it is needed for compatibility with [`resnet`](#). 
""" function skip_identity(inplanes, outplanes) - if outplanes > inplanes - return Chain(MaxPool((1, 1), stride = 2), - y -> cat(y, zeros(eltype(y), - size(y, 1), - size(y, 2), - outplanes - inplanes, size(y, 4)); dims = 3)) - else - return identity - end + if outplanes > inplanes + return Chain(MaxPool((1, 1), stride = 2), + y -> cat(y, + zeros(eltype(y), + size(y, 1), + size(y, 2), + outplanes - inplanes, size(y, 4)); dims = 3)) + else + return identity + end end skip_identity(inplanes, outplanes, downsample) = skip_identity(inplanes, outplanes) @@ -142,10 +144,11 @@ Squeeze and excitation layer used by MobileNet variants (must be >= 1) """ function squeeze_excite(channels, reduction = 4) - @assert (reduction >= 1) "`reduction` must be >= 1" - SkipConnection(Chain(AdaptiveMeanPool((1, 1)), - conv_bn((1, 1), channels, channels ÷ reduction, relu; bias = false)..., - conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) + @assert (reduction>=1) "`reduction` must be >= 1" + SkipConnection(Chain(AdaptiveMeanPool((1, 1)), + conv_bn((1, 1), channels, channels ÷ reduction, relu; + bias = false)..., + conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) end """ @@ -166,21 +169,22 @@ Create a basic inverted residual block for MobileNet variants in a squeeze and excite layer (see [`squeeze_excite`](#)). Must be >= 1 or `nothing` for no squeeze and excite layer. """ -function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation = relu; +function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, + activation = relu; stride, reduction = nothing) - @assert stride in [1, 2] "`stride` has to be 1 or 2" - - pad = @. (kernel_size - 1) ÷ 2 - conv1 = (inplanes == hidden_planes) ? identity : Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) - selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) - - invres = Chain(conv1, - conv_bn(kernel_size, hidden_planes, hidden_planes, activation; - bias = false, stride, pad = pad, groups = hidden_planes)..., - selayer, - conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) - - (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres + @assert stride in [1, 2] "`stride` has to be 1 or 2" + pad = @. (kernel_size - 1) ÷ 2 + conv1 = (inplanes == hidden_planes) ? identity : + Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) + selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) + invres = Chain(conv1, + conv_bn(kernel_size, hidden_planes, hidden_planes, activation; + bias = false, stride, pad = pad, groups = hidden_planes)..., + selayer, + conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) + + (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres +end +function invertedresidual(kernel_size::Integer, args...; kwargs...) + invertedresidual((kernel_size, kernel_size), args...; kwargs...) end -invertedresidual(kernel_size::Integer, args...; kwargs...) = - invertedresidual((kernel_size, kernel_size), args...; kwargs...) diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 06116bdc2..37b6f4f7b 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -20,16 +20,13 @@ patches. 
function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) - - im_height, im_width = imsize - patch_height, patch_width = patch_size - - @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) - "Image dimensions must be divisible by the patch size." - - return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), - flatten ? _flatten_spatial : identity, - norm_layer(embedplanes)) + im_height, im_width = imsize + patch_height, patch_width = patch_size + @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) + "Image dimensions must be divisible by the patch size." + return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), + flatten ? _flatten_spatial : identity, + norm_layer(embedplanes)) end """ @@ -38,11 +35,13 @@ end Positional embedding layer used by many vision transformer-like models. """ struct ViPosEmbedding{T} - vectors::T + vectors::T end -ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) = - ViPosEmbedding(init((embedsize, npatches))) +function ViPosEmbedding(embedsize::Integer, npatches::Integer; + init = (dims::Dims{2}) -> rand(Float32, dims)) + ViPosEmbedding(init((embedsize, npatches))) +end (p::ViPosEmbedding)(x) = x .+ p.vectors @@ -54,7 +53,7 @@ ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> Appends class tokens to an input with embedding dimension `dim` for use in many vision transformer models. """ struct ClassTokens{T} - token::T + token::T end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index ca8f38f97..f14ba8a8c 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -11,10 +11,10 @@ Feedforward block used in many MLPMixer-like and vision-transformer models. - `dropout`: Dropout rate. - `activation`: Activation function to use. """ -function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; - dropout = 0., activation = gelu) - Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), - Dense(hidden_planes, outplanes), Dropout(dropout)) +function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; + dropout = 0.0, activation = gelu) + Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), + Dense(hidden_planes, outplanes), Dropout(dropout)) end """ @@ -33,12 +33,12 @@ Feedforward block based on the implementation in the paper "Pay Attention to MLP - `activation`: Activation function to use. """ function gated_mlp_block(gate_layer, inplanes::Integer, hidden_planes::Integer, - outplanes::Integer = inplanes; dropout = 0., activation = gelu) - @assert hidden_planes % 2 == 0 "`hidden_planes` must be even for gated MLP" - return Chain(Dense(inplanes, hidden_planes, activation), - Dropout(dropout), - gate_layer(hidden_planes), - Dense(hidden_planes ÷ 2, outplanes), - Dropout(dropout)) + outplanes::Integer = inplanes; dropout = 0.0, activation = gelu) + @assert hidden_planes % 2==0 "`hidden_planes` must be even for gated MLP" + return Chain(Dense(inplanes, hidden_planes, activation), + Dropout(dropout), + gate_layer(hidden_planes), + Dense(hidden_planes ÷ 2, outplanes), + Dropout(dropout)) end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) 
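[editor's note: illustrative usage sketch, not part of the patch above. It assumes the `mlp_block` / `gated_mlp_block` signatures shown in the src/layers/mlp.jl hunks, that Flux's `Dense(in, out)` form works as used elsewhere in this patch, and that the helpers are reachable as unexported names via `Metalhead.`.]

using Metalhead, Flux

# plain feedforward block: planes -> hidden -> planes, applied over the channel dim
block = Metalhead.mlp_block(256, 1024; dropout = 0.1)
x = rand(Float32, 256, 196, 1)          # (planes, npatches, batch)
@assert size(block(x)) == (256, 196, 1)

# the gated variant expects `gate_layer(hidden_planes)` to return a layer that maps
# `hidden_planes` channels down to `hidden_planes ÷ 2`; a plain Dense stands in here
# for the SpatialGatingUnit used by gMLP later in this patch.
gated = Metalhead.gated_mlp_block(planes -> Dense(planes, planes ÷ 2), 256, 1024)
@assert size(gated(x)) == (256, 196, 1)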
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index a7bce3e6c..42405b563 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -12,16 +12,16 @@ Note that this is specifically for inputs with 4 dimensions in the format (H, W, C, N) where H, W are the height and width of the input, C is the number of channels, and N is the batch size. """ -struct ChannelLayerNorm{D,T} - diag::D - ϵ::T +struct ChannelLayerNorm{D, T} + diag::D + ϵ::T end @functor ChannelLayerNorm (m::ChannelLayerNorm)(x) = m.diag(MLUtils.normalise(x, dims = ndims(x) - 1, ϵ = m.ϵ)) -function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1f-5) - diag = Flux.Scale(1, 1, sz, λ) - return ChannelLayerNorm(diag, ϵ) +function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1.0f-5) + diag = Flux.Scale(1, 1, sz, λ) + return ChannelLayerNorm(diag, ϵ) end diff --git a/src/layers/others.jl b/src/layers/others.jl index 366b273e4..249cacd0e 100644 --- a/src/layers/others.jl +++ b/src/layers/others.jl @@ -8,8 +8,9 @@ Creates a `Flux.Scale` layer that performs "`LayerScale`" - `planes`: Size of channel dimension in the input. - `λ`: initialisation value for the learnable diagonal matrix. """ -LayerScale(planes::Integer, λ) = +function LayerScale(planes::Integer, λ) λ > 0 ? Flux.Scale(fill(Float32(λ), planes), false) : identity +end """ DropPath(p) @@ -20,4 +21,4 @@ Implements Stochastic Depth - equivalent to `Dropout(p; dims = 4)` when `p` ≥ # Arguments - `p`: rate of Stochastic Depth. """ -DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity \ No newline at end of file +DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity diff --git a/src/other/mlpmixer.jl b/src/other/mlpmixer.jl index 880486dc2..a88118060 100644 --- a/src/other/mlpmixer.jl +++ b/src/other/mlpmixer.jl @@ -15,17 +15,17 @@ Creates a feedforward block for the MLPMixer architecture. - `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks """ -function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu) - tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] - return Chain(SkipConnection(Chain(LayerNorm(planes), - swapdims((2, 1, 3)), - mlp_layer(npatches, tokenplanes; activation, dropout), - swapdims((2, 1, 3)), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(LayerNorm(planes), - mlp_layer(planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +)) +function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, + dropout = 0.0, drop_path_rate = 0.0, activation = gelu) + tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] + return Chain(SkipConnection(Chain(LayerNorm(planes), + swapdims((2, 1, 3)), + mlp_layer(npatches, tokenplanes; activation, dropout), + swapdims((2, 1, 3)), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(LayerNorm(planes), + mlp_layer(planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +)) end """ @@ -50,27 +50,30 @@ Creates a model with the MLPMixer architecture. - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if not specified. 
""" -function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, norm_layer = LayerNorm, - patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., +function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, + norm_layer = LayerNorm, + patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, depth = 12, nclasses = 1000, kwargs...) - npatches = prod(imsize .÷ patch_size) - dp_rates = LinRange{Float32}(0., drop_path_rate, depth) - layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], kwargs...) - for i in 1:depth])) - - classification_head = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) - return Chain(layers, classification_head) + npatches = prod(imsize .÷ patch_size) + dp_rates = LinRange{Float32}(0.0, drop_path_rate, depth) + layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], + kwargs...) + for i in 1:depth])) + + classification_head = Chain(norm_layer(embedplanes), seconddimmean, + Dense(embedplanes, nclasses)) + return Chain(layers, classification_head) end # Configurations for MLPMixer models -mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), - :base => Dict(:depth => 12, :planes => 768), +mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), + :base => Dict(:depth => 12, :planes => 768), :large => Dict(:depth => 24, :planes => 1024), - :huge => Dict(:depth => 32, :planes => 1280)) + :huge => Dict(:depth => 32, :planes => 1280)) struct MLPMixer - layers + layers::Any end """ @@ -90,12 +93,13 @@ Creates a model with the MLPMixer architecture. See also [`Metalhead.mlpmixer`](#). """ function MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, nclasses) - MLPMixer(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, + nclasses) + MLPMixer(layers) end @functor MLPMixer @@ -124,21 +128,22 @@ Creates a block for the ResMixer architecture. 
- `λ`: initialisation constant for the LayerScale """ function resmixerblock(planes, npatches; mlp_ratio = 4.0, mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu, λ = 1e-4) -return Chain(SkipConnection(Chain(Flux.Scale(planes), - swapdims((2, 1, 3)), - Dense(npatches, npatches), - swapdims((2, 1, 3)), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(Flux.Scale(planes), - mlp_layer(planes, Int(mlp_ratio * planes); dropout, activation), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +)) + dropout = 0.0, drop_path_rate = 0.0, activation = gelu, λ = 1e-4) + return Chain(SkipConnection(Chain(Flux.Scale(planes), + swapdims((2, 1, 3)), + Dense(npatches, npatches), + swapdims((2, 1, 3)), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(Flux.Scale(planes), + mlp_layer(planes, Int(mlp_ratio * planes); dropout, + activation), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +)) end struct ResMLP - layers + layers::Any end """ @@ -158,13 +163,13 @@ Creates a model with the ResMLP architecture. See also [`Metalhead.mlpmixer`](#). """ function ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - drop_path_rate, depth, nclasses) - ResMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, + drop_path_rate, depth, nclasses) + ResMLP(layers) end @functor ResMLP @@ -185,8 +190,8 @@ Creates a spatial gating unit as described in the gMLP paper. - `proj`: the projection layer to use """ struct SpatialGatingUnit{T, F} - norm::T - proj::F + norm::T + proj::F end """ @@ -201,19 +206,19 @@ Creates a spatial gating unit as described in the gMLP paper. 
- `norm_layer`: the normalisation layer to use """ function SpatialGatingUnit(planes::Integer, npatches::Integer; norm_layer = LayerNorm) - gateplanes = planes ÷ 2 - norm = norm_layer(gateplanes) - proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) - return SpatialGatingUnit(norm, proj) + gateplanes = planes ÷ 2 + norm = norm_layer(gateplanes) + proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) + return SpatialGatingUnit(norm, proj) end @functor SpatialGatingUnit function (m::SpatialGatingUnit)(x) - u, v = chunk(x, 2; dims = 1) - v = m.norm(v) - v = m.proj(permutedims(v, (2, 1, 3))) - return u .* permutedims(v, (2, 1, 3)) + u, v = chunk(x, 2; dims = 1) + v = m.norm(v) + v = m.proj(permutedims(v, (2, 1, 3))) + return u .* permutedims(v, (2, 1, 3)) end """ @@ -235,17 +240,18 @@ Creates a feedforward block based on the gMLP model architecture described in th - `activation`: the activation function to use in the MLP blocks """ function spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = LayerNorm, - mlp_layer = gated_mlp_block, dropout = 0., drop_path_rate = 0., + mlp_layer = gated_mlp_block, dropout = 0.0, + drop_path_rate = 0.0, activation = gelu) - channelplanes = Int(mlp_ratio * planes) - sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) - return SkipConnection(Chain(norm_layer(planes), - mlp_layer(sgu, planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +) + channelplanes = Int(mlp_ratio * planes) + sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) + return SkipConnection(Chain(norm_layer(planes), + mlp_layer(sgu, planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +) end struct gMLP - layers + layers::Any end """ @@ -265,14 +271,13 @@ Creates a model with the gMLP architecture. See also [`Metalhead.mlpmixer`](#). """ function gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, - patch_size, embedplanes, drop_path_rate, depth, nclasses) - - gMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, + patch_size, embedplanes, drop_path_rate, depth, nclasses) + gMLP(layers) end @functor gMLP diff --git a/src/pretrain.jl b/src/pretrain.jl index 97ab7398e..24e6d176d 100644 --- a/src/pretrain.jl +++ b/src/pretrain.jl @@ -4,17 +4,17 @@ Load the pre-trained weights for `model` using the stored artifacts. 
""" function weights(model) - try - path = joinpath(@artifact_str(model), "$model.bson") - artifact = BSON.load(path, @__MODULE__) - if haskey(artifact, :model) - return artifact[:model] - else - throw(ArgumentError("No pre-trained weights available for $model.")) + try + path = joinpath(@artifact_str(model), "$model.bson") + artifact = BSON.load(path, @__MODULE__) + if haskey(artifact, :model) + return artifact[:model] + else + throw(ArgumentError("No pre-trained weights available for $model.")) + end + catch e + throw(ArgumentError("No pre-trained weights available for $model.")) end - catch e - throw(ArgumentError("No pre-trained weights available for $model.")) - end end """ diff --git a/src/utilities.jl b/src/utilities.jl index 39dbdd3b2..6adc1ec87 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -4,9 +4,9 @@ seconddimmean(x) = dropdims(mean(x, dims = 2); dims = 2) # utility function for making sure that all layers have a channel size divisible by 8 # used by MobileNet variants function _round_channels(channels, divisor, min_value = divisor) - new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) - # Make sure that round down does not go down by more than 10% - return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels + new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) + # Make sure that round down does not go down by more than 10% + return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels end """ @@ -47,11 +47,11 @@ swapdims(perm) = Base.Fix2(permutedims, perm) # Utility function for pretty printing large models function _maybe_big_show(io, model) - if isdefined(Flux, :_big_show) - if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL - Flux._big_show(io, model) - else - show(io, model) + if isdefined(Flux, :_big_show) + if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL + Flux._big_show(io, model) + else + show(io, model) + end end - end end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 55b3e3d30..547ca1612 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -11,13 +11,15 @@ Transformer as used in the base ViT architecture. - `mlp_ratio`: ratio of MLP layers to the number of input channels - `dropout`: dropout rate """ -function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) 
- layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; attn_drop = dropout, - proj_drop = dropout)), +), - SkipConnection(prenorm(planes, mlp_block(planes, floor(Int, mlp_ratio * planes); - dropout)), +)) - for _ in 1:depth] - Chain(layers) +function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.0) + layers = [Chain(SkipConnection(prenorm(planes, + MHAttention(planes, nheads; attn_drop = dropout, + proj_drop = dropout)), +), + SkipConnection(prenorm(planes, + mlp_block(planes, floor(Int, mlp_ratio * planes); + dropout)), +)) + for _ in 1:depth] + Chain(layers) end """ @@ -62,8 +64,10 @@ vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), :base => (depth = 12, embedplanes = 768, nheads = 12), :large => (depth = 24, embedplanes = 1024, nheads = 16), :huge => (depth = 32, embedplanes = 1280, nheads = 16), - :giant => (depth = 40, embedplanes = 1408, nheads = 16, mlp_ratio = 48/11), - :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, mlp_ratio = 64/13)) + :giant => (depth = 40, embedplanes = 1408, nheads = 16, + mlp_ratio = 48 / 11), + :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, + mlp_ratio = 64 / 13)) """ ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels = 3, @@ -83,16 +87,16 @@ Creates a Vision Transformer (ViT) model. See also [`Metalhead.vit`](#). """ struct ViT - layers + layers::Any end function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), inchannels = 3, patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) - @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" - kwargs = vit_configs[mode] - layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) + @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" + kwargs = vit_configs[mode] + layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) 
- ViT(layers) + ViT(layers) end (m::ViT)(x) = m.layers(x) diff --git a/test/convnets.jl b/test/convnets.jl index 3540c3e9f..f62ecc3fd 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -5,202 +5,192 @@ using Flux PRETRAINED_MODELS = [] @testset "AlexNet" begin - model = AlexNet() - @test size(model(x_256)) == (1000, 1) - @test_throws ArgumentError AlexNet(pretrain = true) - @test gradtest(model, x_256) + model = AlexNet() + @test size(model(x_256)) == (1000, 1) + @test_throws ArgumentError AlexNet(pretrain = true) + @test gradtest(model, x_256) end GC.safepoint() GC.gc() -@testset "VGG" begin - @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] +@testset "VGG" begin @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], + bn in [true, false] + m = VGG(sz, batchnorm = bn) @test size(m(x_224)) == (1000, 1) if (VGG, sz, bn) in PRETRAINED_MODELS - @test (VGG(sz, batchnorm = bn, pretrain = true); true) + @test (VGG(sz, batchnorm = bn, pretrain = true); true) else - @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() @testset "ResNet" begin - @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] - m = ResNet(sz) - - @test size(m(x_256)) == (1000, 1) - if (ResNet, sz) in PRETRAINED_MODELS - @test (ResNet(sz, pretrain = true); true) - else - @test_throws ArgumentError ResNet(sz, pretrain = true) + @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] + m = ResNet(sz) + + @test size(m(x_256)) == (1000, 1) + if (ResNet, sz) in PRETRAINED_MODELS + @test (ResNet(sz, pretrain = true); true) + else + @test_throws ArgumentError ResNet(sz, pretrain = true) + end + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() - end - @testset "Shortcut C" begin - m = Metalhead.resnet(Metalhead.basicblock, :C; - channel_config = [1, 1], - block_config = [2, 2, 2, 2]) + @testset "Shortcut C" begin + m = Metalhead.resnet(Metalhead.basicblock, :C; + channel_config = [1, 1], + block_config = [2, 2, 2, 2]) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - end + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + end end GC.safepoint() GC.gc() -@testset "ResNeXt" begin - @testset for depth in [50, 101, 152] +@testset "ResNeXt" begin @testset for depth in [50, 101, 152] m = ResNeXt(depth) @test size(m(x_224)) == (1000, 1) if ResNeXt in PRETRAINED_MODELS - @test (ResNeXt(depth, pretrain = true); true) + @test (ResNeXt(depth, pretrain = true); true) else - @test_throws ArgumentError ResNeXt(depth, pretrain = true) + @test_throws ArgumentError ResNeXt(depth, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() @testset "GoogLeNet" begin - m = GoogLeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (GoogLeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = GoogLeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (GoogLeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset "Inception3" begin - m = Inception3() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError Inception3(pretrain = true) - @test gradtest(m, x_224) + m = Inception3() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError Inception3(pretrain = true) + @test gradtest(m, x_224) end 
GC.safepoint() GC.gc() @testset "SqueezeNet" begin - m = SqueezeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (SqueezeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = SqueezeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (SqueezeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() -@testset "DenseNet" begin - @testset for sz in [121, 161, 169, 201] +@testset "DenseNet" begin @testset for sz in [121, 161, 169, 201] m = DenseNet(sz) @test size(m(x_224)) == (1000, 1) if (DenseNet, sz) in PRETRAINED_MODELS - @test (DenseNet(sz, pretrain = true); true) + @test (DenseNet(sz, pretrain = true); true) else - @test_throws ArgumentError DenseNet(sz, pretrain = true) + @test_throws ArgumentError DenseNet(sz, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() -@testset "MobileNet" verbose = true begin - @testset "MobileNetv1" begin - m = MobileNetv1() - - @test size(m(x_224)) == (1000, 1) - if MobileNetv1 in PRETRAINED_MODELS - @test (MobileNetv1(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv1(pretrain = true) +@testset "MobileNet" verbose=true begin + @testset "MobileNetv1" begin + m = MobileNetv1() + + @test size(m(x_224)) == (1000, 1) + if MobileNetv1 in PRETRAINED_MODELS + @test (MobileNetv1(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv1(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - GC.safepoint() - GC.gc() + GC.safepoint() + GC.gc() - @testset "MobileNetv2" begin - m = MobileNetv2() + @testset "MobileNetv2" begin + m = MobileNetv2() - @test size(m(x_224)) == (1000, 1) - if MobileNetv2 in PRETRAINED_MODELS - @test (MobileNetv2(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv2(pretrain = true) + @test size(m(x_224)) == (1000, 1) + if MobileNetv2 in PRETRAINED_MODELS + @test (MobileNetv2(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv2(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - - GC.safepoint() - GC.gc() - - @testset "MobileNetv3" verbose = true begin - @testset for mode in [:small, :large] - m = MobileNetv3(mode) - - @test size(m(x_224)) == (1000, 1) - if MobileNetv3 in PRETRAINED_MODELS - @test (MobileNetv3(mode; pretrain = true); true) - else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) - end - @test gradtest(m, x_224) - end - end + + GC.safepoint() + GC.gc() + + @testset "MobileNetv3" verbose=true begin @testset for mode in [:small, :large] + m = MobileNetv3(mode) + + @test size(m(x_224)) == (1000, 1) + if MobileNetv3 in PRETRAINED_MODELS + @test (MobileNetv3(mode; pretrain = true); true) + else + @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + end + @test gradtest(m, x_224) + end end end GC.safepoint() GC.gc() -@testset "ConvNeXt" verbose = true begin - @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] +@testset "ConvNeXt" verbose=true begin @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) + m = ConvNeXt(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end GC.safepoint() GC.gc() -@testset "ConvMixer" verbose = true begin - @testset for mode in [:small, 
:base, :large] +@testset "ConvMixer" verbose=true begin @testset for mode in [:small, :base, :large] m = ConvMixer(mode) @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end diff --git a/test/other.jl b/test/other.jl index 0162bc4bc..db0bf223c 100644 --- a/test/other.jl +++ b/test/other.jl @@ -1,38 +1,32 @@ using Metalhead, Test using Flux -@testset "MLPMixer" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "MLPMixer" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = MLPMixer(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end -@testset "ResMLP" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "ResMLP" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = ResMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end -@testset "gMLP" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "gMLP" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = gMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end diff --git a/test/runtests.jl b/test/runtests.jl index 6dd4a1aa4..61af837a7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,33 +3,27 @@ using Flux using Flux: Zygote function gradtest(model, input) - y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) - gs = pb(ones(Float32, size(y))) + y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) + gs = pb(ones(Float32, size(y))) - # if we make it to here with no error, success! - return true + # if we make it to here with no error, success! 
+ return true end x_224 = rand(Float32, 224, 224, 3, 1) x_256 = rand(Float32, 256, 256, 3, 1) # CNN tests -@testset verbose = true "ConvNets" begin - include("convnets.jl") -end +@testset verbose=true "ConvNets" begin include("convnets.jl") end GC.safepoint() GC.gc() # Other tests -@testset verbose = true "Other" begin - include("other.jl") -end +@testset verbose=true "Other" begin include("other.jl") end GC.safepoint() GC.gc() # ViT tests -@testset verbose = true "ViTs" begin - include("vit-based.jl") -end +@testset verbose=true "ViTs" begin include("vit-based.jl") end diff --git a/test/vit-based.jl b/test/vit-based.jl index 20b6ecb86..ebd1a0fc2 100644 --- a/test/vit-based.jl +++ b/test/vit-based.jl @@ -1,12 +1,10 @@ using Metalhead, Test using Flux -@testset "ViT" begin - for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] +@testset "ViT" begin for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] m = ViT(mode) @test size(m(x_256)) == (1000, 1) @test gradtest(m, x_256) GC.safepoint() GC.gc() - end -end +end end From cb3cd285b6d0e7e66730239866078f6c5c85a11c Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 30 May 2022 06:20:20 +0530 Subject: [PATCH 6/8] Create .git-blame-ignore-revs --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..d62e45914 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,3 @@ +# .git-blame-ignore-revs +# Switched to SciML style for code +fd2869f57c66fa650547cd8581feeba9eda08b88 From e4209fca4d35b725983f424158c650cea8948238 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 11 Jun 2022 14:24:49 +0530 Subject: [PATCH 7/8] Fix formatting for tests --- src/vit-based/vit.jl | 21 ++++--- test/convnets.jl | 127 +++++++++++++++++++++---------------------- test/other.jl | 54 ++++++++++-------- test/runtests.jl | 12 +++- test/vit-based.jl | 16 +++--- 5 files changed, 121 insertions(+), 109 deletions(-) diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 547ca1612..1ebce1bbe 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -46,17 +46,16 @@ Creates a Vision Transformer (ViT) model. function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout = 0.1, emb_dropout = 0.1, pool = :class, nclasses = 1000) - - @assert pool in [:class, :mean] - "Pool type must be either :class (class token) or :mean (mean pooling)" - npatches = prod(imsize .÷ patch_size) - return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - ClassTokens(embedplanes), - ViPosEmbedding(embedplanes, npatches + 1), - Dropout(emb_dropout), - transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), - (pool == :class) ? x -> x[:, 1, :] : seconddimmean), - Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) + @assert pool in [:class, :mean] + "Pool type must be either :class (class token) or :mean (mean pooling)" + npatches = prod(imsize .÷ patch_size) + return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + ClassTokens(embedplanes), + ViPosEmbedding(embedplanes, npatches + 1), + Dropout(emb_dropout), + transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), + (pool == :class) ? 
x -> x[:, 1, :] : seconddimmean), + Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), diff --git a/test/convnets.jl b/test/convnets.jl index f62ecc3fd..7be6d70bc 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -14,21 +14,20 @@ end GC.safepoint() GC.gc() -@testset "VGG" begin @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], - bn in [true, false] - - m = VGG(sz, batchnorm = bn) - - @test size(m(x_224)) == (1000, 1) - if (VGG, sz, bn) in PRETRAINED_MODELS - @test (VGG(sz, batchnorm = bn, pretrain = true); true) - else - @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) +@testset "VGG" begin + @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] + m = VGG(sz, batchnorm = bn) + @test size(m(x_224)) == (1000, 1) + if (VGG, sz, bn) in PRETRAINED_MODELS + @test (VGG(sz, batchnorm = bn, pretrain = true); true) + else + @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() -end end +end GC.safepoint() GC.gc() @@ -36,7 +35,6 @@ GC.gc() @testset "ResNet" begin @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] m = ResNet(sz) - @test size(m(x_256)) == (1000, 1) if (ResNet, sz) in PRETRAINED_MODELS @test (ResNet(sz, pretrain = true); true) @@ -52,7 +50,6 @@ GC.gc() m = Metalhead.resnet(Metalhead.basicblock, :C; channel_config = [1, 1], block_config = [2, 2, 2, 2]) - @test size(m(x_256)) == (1000, 1) @test gradtest(m, x_256) end @@ -61,19 +58,20 @@ end GC.safepoint() GC.gc() -@testset "ResNeXt" begin @testset for depth in [50, 101, 152] - m = ResNeXt(depth) - - @test size(m(x_224)) == (1000, 1) - if ResNeXt in PRETRAINED_MODELS - @test (ResNeXt(depth, pretrain = true); true) - else - @test_throws ArgumentError ResNeXt(depth, pretrain = true) +@testset "ResNeXt" begin + @testset for depth in [50, 101, 152] + m = ResNeXt(depth) + @test size(m(x_224)) == (1000, 1) + if ResNeXt in PRETRAINED_MODELS + @test (ResNeXt(depth, pretrain = true); true) + else + @test_throws ArgumentError ResNeXt(depth, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() -end end +end GC.safepoint() GC.gc() @@ -108,19 +106,20 @@ end GC.safepoint() GC.gc() -@testset "DenseNet" begin @testset for sz in [121, 161, 169, 201] - m = DenseNet(sz) - - @test size(m(x_224)) == (1000, 1) - if (DenseNet, sz) in PRETRAINED_MODELS - @test (DenseNet(sz, pretrain = true); true) - else - @test_throws ArgumentError DenseNet(sz, pretrain = true) +@testset "DenseNet" begin + @testset for sz in [121, 161, 169, 201] + m = DenseNet(sz) + @test size(m(x_224)) == (1000, 1) + if (DenseNet, sz) in PRETRAINED_MODELS + @test (DenseNet(sz, pretrain = true); true) + else + @test_throws ArgumentError DenseNet(sz, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() -end end +end GC.safepoint() GC.gc() @@ -128,7 +127,6 @@ GC.gc() @testset "MobileNet" verbose=true begin @testset "MobileNetv1" begin m = MobileNetv1() - @test size(m(x_224)) == (1000, 1) if MobileNetv1 in PRETRAINED_MODELS @test (MobileNetv1(pretrain = true); true) @@ -143,7 +141,6 @@ GC.gc() @testset "MobileNetv2" begin m = MobileNetv2() - @test size(m(x_224)) == (1000, 1) if MobileNetv2 in PRETRAINED_MODELS @test (MobileNetv2(pretrain = 
true); true) @@ -156,39 +153,41 @@ GC.gc() GC.safepoint() GC.gc() - @testset "MobileNetv3" verbose=true begin @testset for mode in [:small, :large] - m = MobileNetv3(mode) - - @test size(m(x_224)) == (1000, 1) - if MobileNetv3 in PRETRAINED_MODELS - @test (MobileNetv3(mode; pretrain = true); true) - else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + @testset "MobileNetv3" verbose=true begin + @testset for mode in [:small, :large] + m = MobileNetv3(mode) + @test size(m(x_224)) == (1000, 1) + if MobileNetv3 in PRETRAINED_MODELS + @test (MobileNetv3(mode; pretrain = true); true) + else + @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end end + end end GC.safepoint() GC.gc() -@testset "ConvNeXt" verbose=true begin @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] - @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) - - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end -end end +@testset "ConvNeXt" verbose=true begin + @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] + @testset for drop_path_rate in [0.0, 0.5] + m = ConvNeXt(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end + end +end GC.safepoint() GC.gc() -@testset "ConvMixer" verbose=true begin @testset for mode in [:small, :base, :large] +@testset "ConvMixer" verbose=true begin + @testset for mode in [:small, :base, :large] m = ConvMixer(mode) - @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) GC.safepoint() diff --git a/test/other.jl b/test/other.jl index db0bf223c..0d3727f05 100644 --- a/test/other.jl +++ b/test/other.jl @@ -1,32 +1,38 @@ using Metalhead, Test using Flux -@testset "MLPMixer" begin @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() +@testset "MLPMixer" begin + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = MLPMixer(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end -end end +end -@testset "ResMLP" begin @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() +@testset "ResMLP" begin + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = ResMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end -end end +end -@testset "gMLP" begin @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() +@testset "gMLP" begin + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = gMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end -end end +end diff --git a/test/runtests.jl b/test/runtests.jl index 61af837a7..79841244c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,16 +14,22 @@ x_224 = 
rand(Float32, 224, 224, 3, 1) x_256 = rand(Float32, 256, 256, 3, 1) # CNN tests -@testset verbose=true "ConvNets" begin include("convnets.jl") end +@testset verbose=true "ConvNets" begin + include("convnets.jl") +end GC.safepoint() GC.gc() # Other tests -@testset verbose=true "Other" begin include("other.jl") end +@testset verbose=true "Other" begin + include("other.jl") +end GC.safepoint() GC.gc() # ViT tests -@testset verbose=true "ViTs" begin include("vit-based.jl") end +@testset verbose=true "ViTs" begin + include("vit-based.jl") +end diff --git a/test/vit-based.jl b/test/vit-based.jl index ebd1a0fc2..cdaffc430 100644 --- a/test/vit-based.jl +++ b/test/vit-based.jl @@ -1,10 +1,12 @@ using Metalhead, Test using Flux -@testset "ViT" begin for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] - m = ViT(mode) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() -end end +@testset "ViT" begin + for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] + m = ViT(mode) + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() + end +end From 562f61d690ec34a1ea97bc408bd0c8db81684a28 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 11 Jun 2022 14:29:21 +0530 Subject: [PATCH 8/8] Bump version to generate dev docs without error --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index c010c513d..69f35b397 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Metalhead" uuid = "dbeba491-748d-5e0e-a39e-b530a07fa0cc" -version = "0.7.1" +version = "0.7.1-DEV" [deps] Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
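[editor's note: a minimal sketch of the shape check that the reformatted test suites above revolve around, for readers trying the changes locally; it only assumes the exported constructors and input sizes already used in the hunks, and is not part of the patch.]

using Metalhead

m = ViT(:base)                      # other constructors follow the same pattern
x = rand(Float32, 256, 256, 3, 1)   # ViT defaults to a 256×256 input; most CNN tests use 224×224
size(m(x)) == (1000, 1)             # the assertion every @testset above makes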