From fd2869f57c66fa650547cd8581feeba9eda08b88 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 30 May 2022 06:19:37 +0530 Subject: [PATCH 1/8] Switch to SciML style for code --- .JuliaFormatter.toml | 2 + src/Metalhead.jl | 25 ++-- src/convnets/alexnet.jl | 43 +++--- src/convnets/convmixer.jl | 39 +++--- src/convnets/convnext.jl | 101 +++++++------- src/convnets/densenet.jl | 82 ++++++------ src/convnets/googlenet.jl | 60 ++++----- src/convnets/inception.jl | 191 ++++++++++++--------------- src/convnets/mobilenet.jl | 263 ++++++++++++++++++------------------- src/convnets/resnet.jl | 121 ++++++++--------- src/convnets/resnext.jl | 94 ++++++------- src/convnets/squeezenet.jl | 57 ++++---- src/convnets/vgg.jl | 112 ++++++++-------- src/layers/attention.jl | 52 ++++---- src/layers/conv.jl | 134 ++++++++++--------- src/layers/embeddings.jl | 31 +++-- src/layers/mlp.jl | 22 ++-- src/layers/normalise.jl | 12 +- src/layers/others.jl | 5 +- src/other/mlpmixer.jl | 155 +++++++++++----------- src/pretrain.jl | 20 +-- src/utilities.jl | 18 +-- src/vit-based/vit.jl | 53 ++++---- test/convnets.jl | 200 ++++++++++++++-------------- test/other.jl | 48 +++---- test/runtests.jl | 20 +-- test/vit-based.jl | 6 +- 27 files changed, 972 insertions(+), 994 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 000000000..93a9e7665 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +style = "sciml" +whitespace_in_kwargs = true diff --git a/src/Metalhead.jl b/src/Metalhead.jl index a0fb3785a..e465b6981 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -37,22 +37,23 @@ include("vit-based/vit.jl") include("pretrain.jl") -export AlexNet, - VGG, VGG11, VGG13, VGG16, VGG19, - ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, - GoogLeNet, Inception3, SqueezeNet, - DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, - ResNeXt, - MobileNetv1, MobileNetv2, MobileNetv3, - MLPMixer, ResMLP, gMLP, - ViT, - ConvNeXt, ConvMixer +export AlexNet, + VGG, VGG11, VGG13, VGG16, VGG19, + ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, + GoogLeNet, Inception3, SqueezeNet, + DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, + ResNeXt, + MobileNetv1, MobileNetv2, MobileNetv3, + MLPMixer, ResMLP, gMLP, + ViT, + ConvNeXt, ConvMixer # use Flux._big_show to pretty print large models -for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, :ResNeXt, +for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, + :ResNeXt, :MobileNetv1, :MobileNetv2, :MobileNetv3, :MLPMixer, :ResMLP, :gMLP, :ViT, :ConvNeXt, :ConvMixer) - @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) + @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) end end # module diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index ea3962c2a..93bf1cd67 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -8,23 +8,23 @@ Create an AlexNet model - `nclasses`: the number of output classes """ function alexnet(; nclasses = 1000) - layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((5, 5), 64 => 192, relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((3, 3), 192 => 384, relu, pad = (1, 1)), - Conv((3, 3), 384 => 256, relu, pad = (1, 1)), - Conv((3, 3), 256 => 256, 
relu, pad = (1, 1)), - MaxPool((3, 3), stride = (2, 2)), - AdaptiveMeanPool((6,6))), - Chain(MLUtils.flatten, - Dropout(0.5), - Dense(256 * 6 * 6, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dense(4096, nclasses))) - - return layers + layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Conv((5, 5), 64 => 192, relu, pad = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Conv((3, 3), 192 => 384, relu, pad = (1, 1)), + Conv((3, 3), 384 => 256, relu, pad = (1, 1)), + Conv((3, 3), 256 => 256, relu, pad = (1, 1)), + MaxPool((3, 3), stride = (2, 2)), + AdaptiveMeanPool((6, 6))), + Chain(MLUtils.flatten, + Dropout(0.5), + Dense(256 * 6 * 6, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dense(4096, nclasses))) + + return layers end """ @@ -41,14 +41,13 @@ See also [`alexnet`](#). - `nclasses`: the number of output classes """ struct AlexNet - layers + layers::Any end function AlexNet(; pretrain = false, nclasses = 1000) - layers = alexnet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "AlexNet") - - AlexNet(layers) + layers = alexnet(nclasses = nclasses) + pretrain && loadpretrain!(layers, "AlexNet") + AlexNet(layers) end @functor AlexNet diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index 01a6e61be..2a6aeae05 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -16,20 +16,24 @@ Creates a ConvMixer model. """ function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000) - stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1]) - blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; - preact = true, groups = planes, pad = SamePad())), +), - conv_bn((1, 1), planes, planes, activation; preact = true)...) for _ in 1:depth] - head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) - return Chain(Chain(stem..., Chain(blocks)), head) + stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, + stride = patch_size[1]) + blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; + preact = true, groups = planes, + pad = SamePad())), +), + conv_bn((1, 1), planes, planes, activation; preact = true)...) + for _ in 1:depth] + head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) + return Chain(Chain(stem..., Chain(blocks)), head) end convmixer_config = Dict(:base => Dict(:planes => 1536, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7)), + :patch_size => (7, 7)), :small => Dict(:planes => 768, :depth => 32, :kernel_size => (7, 7), - :patch_size => (7, 7)), - :large => Dict(:planes => 1024, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7))) + :patch_size => (7, 7)), + :large => Dict(:planes => 1024, :depth => 20, + :kernel_size => (9, 9), + :patch_size => (7, 7))) """ ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) @@ -44,16 +48,17 @@ Creates a ConvMixer model. 
- `nclasses`: number of classes in the output """ struct ConvMixer - layers + layers::Any end function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) - planes = convmixer_config[mode][:planes] - depth = convmixer_config[mode][:depth] - kernel_size = convmixer_config[mode][:kernel_size] - patch_size = convmixer_config[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, nclasses) - return ConvMixer(layers) + planes = convmixer_config[mode][:planes] + depth = convmixer_config[mode][:depth] + kernel_size = convmixer_config[mode][:kernel_size] + patch_size = convmixer_config[mode][:patch_size] + layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, + nclasses) + return ConvMixer(layers) end @functor ConvMixer diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 1621803bf..0a44e7482 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -9,15 +9,15 @@ Creates a single block of ConvNeXt. - `drop_path_rate`: Stochastic depth rate. - `λ`: Init value for LayerScale """ -function convnextblock(planes, drop_path_rate = 0., λ = 1f-6) - layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), - swapdims((3, 1, 2, 4)), - LayerNorm(planes; ϵ = 1f-6), - mlp_block(planes, 4 * planes), - LayerScale(planes, λ), - swapdims((2, 3, 1, 4)), - DropPath(drop_path_rate)), +) - return layers +function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6) + layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), + swapdims((3, 1, 2, 4)), + LayerNorm(planes; ϵ = 1.0f-6), + mlp_block(planes, 4 * planes), + LayerScale(planes, λ), + swapdims((2, 3, 1, 4)), + DropPath(drop_path_rate)), +) + return layers end """ @@ -34,45 +34,48 @@ Creates the layers for a ConvNeXt model. 
- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) - `nclasses`: number of output classes """ -function convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) - @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" - - downsample_layers = [] - stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), - ChannelLayerNorm(planes[1]; ϵ = 1f-6)) - push!(downsample_layers, stem) - for m in 1:length(depths) - 1 - downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1f-6), - Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) - push!(downsample_layers, downsample_layer) - end - - stages = [] - dp_rates = LinRange{Float32}(0., drop_path_rate, sum(depths)) - cur = 0 - for i in 1:length(depths) - push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) - cur += depths[i] - end - - backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) - head = Chain(GlobalMeanPool(), - MLUtils.flatten, - LayerNorm(planes[end]), - Dense(planes[end], nclasses)) - - return Chain(Chain(backbone), head) +function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, + nclasses = 1000) + @assert length(depths)==length(planes) "`planes` should have exactly one value for each block" + downsample_layers = [] + stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), + ChannelLayerNorm(planes[1]; ϵ = 1.0f-6)) + push!(downsample_layers, stem) + for m in 1:(length(depths) - 1) + downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1.0f-6), + Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) + push!(downsample_layers, downsample_layer) + end + stages = [] + dp_rates = LinRange{Float32}(0.0, drop_path_rate, sum(depths)) + cur = 0 + for i in 1:length(depths) + push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) + cur += depths[i] + end + backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) + head = Chain(GlobalMeanPool(), + MLUtils.flatten, + LayerNorm(planes[end]), + Dense(planes[end], nclasses)) + + return Chain(Chain(backbone), head) end # Configurations for ConvNeXt models -convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], :planes => [96, 192, 384, 768]), - :small => Dict(:depths => [3, 3, 27, 3], :planes => [96, 192, 384, 768]), - :base => Dict(:depths => [3, 3, 27, 3], :planes => [128, 256, 512, 1024]), - :large => Dict(:depths => [3, 3, 27, 3], :planes => [192, 384, 768, 1536]), - :xlarge => Dict(:depths => [3, 3, 27, 3], :planes => [256, 512, 1024, 2048])) +convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], + :planes => [96, 192, 384, 768]), + :small => Dict(:depths => [3, 3, 27, 3], + :planes => [96, 192, 384, 768]), + :base => Dict(:depths => [3, 3, 27, 3], + :planes => [128, 256, 512, 1024]), + :large => Dict(:depths => [3, 3, 27, 3], + :planes => [192, 384, 768, 1536]), + :xlarge => Dict(:depths => [3, 3, 27, 3], + :planes => [256, 512, 1024, 2048])) struct ConvNeXt - layers + layers::Any end """ @@ -89,13 +92,13 @@ Creates a ConvNeXt model. See also [`Metalhead.convnext`](#). 
""" -function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, +function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, nclasses = 1000) - @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" - depths = convnext_configs[mode][:depths] - planes = convnext_configs[mode][:planes] - layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) - return ConvNeXt(layers) + @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" + depths = convnext_configs[mode][:depths] + planes = convnext_configs[mode][:planes] + layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) + return ConvNeXt(layers) end (m::ConvNeXt)(x) = m.layers(x) diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index bda7a321d..be98509e6 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -10,11 +10,12 @@ Create a Densenet bottleneck layer (and scaling factor for inner feature maps; see ref) """ function dense_bottleneck(inplanes, outplanes) - inner_channels = 4 * outplanes - m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., - conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, rev = true)...) + inner_channels = 4 * outplanes + m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., + conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, + rev = true)...) - SkipConnection(m, cat_channels) + SkipConnection(m, cat_channels) end """ @@ -27,8 +28,10 @@ Create a DenseNet transition sequence - `inplanes`: number of input feature maps - `outplanes`: number of output feature maps """ -transition(inplanes, outplanes) = - Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., MeanPool((2, 2))) +function transition(inplanes, outplanes) + Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., + MeanPool((2, 2))) +end """ dense_block(inplanes, growth_rates) @@ -42,8 +45,10 @@ the number of output feature maps by `growth_rates` with each block - `growth_rates`: the growth (additive) rates of output feature maps after each block (a vector of `k`s from the ref) """ -dense_block(inplanes, growth_rates) = [dense_bottleneck(i, o) - for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +function dense_block(inplanes, growth_rates) + [dense_bottleneck(i, o) + for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +end """ densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) @@ -59,24 +64,24 @@ Create a DenseNet model - `nclasses`: the number of output classes """ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) - push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) - - outplanes = 0 - for (i, rates) in enumerate(growth_rates) - outplanes = inplanes + sum(rates) - append!(layers, dense_block(inplanes, rates)) - (i != length(growth_rates)) && - push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) - inplanes = floor(Int, outplanes * reduction) - end - push!(layers, BatchNorm(outplanes, relu)) - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dense(outplanes, nclasses))) + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), 
bias = false)) + push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) + + outplanes = 0 + for (i, rates) in enumerate(growth_rates) + outplanes = inplanes + sum(rates) + append!(layers, dense_block(inplanes, rates)) + (i != length(growth_rates)) && + push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) + inplanes = floor(Int, outplanes * reduction) + end + push!(layers, BatchNorm(outplanes, relu)) + + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dense(outplanes, nclasses))) end """ @@ -91,9 +96,10 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) = - densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; - reduction = reduction, nclasses = nclasses) +function densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) + densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; + reduction = reduction, nclasses = nclasses) +end """ DenseNet(nblocks::NTuple{N, <:Integer}; @@ -110,16 +116,16 @@ See also [`densenet`](#). - `nclasses`: the number of output classes """ struct DenseNet - layers + layers::Any end function DenseNet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N} - layers = densenet(nblocks; growth_rate = growth_rate, - reduction = reduction, - nclasses = nclasses) + layers = densenet(nblocks; growth_rate = growth_rate, + reduction = reduction, + nclasses = nclasses) - DenseNet(layers) + DenseNet(layers) end @functor DenseNet @@ -148,11 +154,11 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.densenet`](#). """ function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) - @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." - model = DenseNet(densenet_config[config]; nclasses = nclasses) + @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." 
+ model = DenseNet(densenet_config[config]; nclasses = nclasses) - pretrain && loadpretrain!(model, string("DenseNet", config)) - return model + pretrain && loadpretrain!(model, string("DenseNet", config)) + return model end # deprecations diff --git a/src/convnets/googlenet.jl b/src/convnets/googlenet.jl index bc42a052f..40dd5ff41 100644 --- a/src/convnets/googlenet.jl +++ b/src/convnets/googlenet.jl @@ -15,16 +15,12 @@ Create an inception module for use in GoogLeNet """ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj) branch1 = Chain(Conv((1, 1), inplanes => out_1x1)) - branch2 = Chain(Conv((1, 1), inplanes => red_3x3), Conv((3, 3), red_3x3 => out_3x3; pad = 1)) - branch3 = Chain(Conv((1, 1), inplanes => red_5x5), - Conv((5, 5), red_5x5 => out_5x5; pad = 2)) - - branch4 = Chain(MaxPool((3, 3), stride=1, pad = 1), + Conv((5, 5), red_5x5 => out_5x5; pad = 2)) + branch4 = Chain(MaxPool((3, 3), stride = 1, pad = 1), Conv((1, 1), inplanes => pool_proj)) - return Parallel(cat_channels, branch1, branch2, branch3, branch4) end @@ -39,28 +35,27 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) - `nclasses`: the number of output classes """ function googlenet(; nclasses = 1000) - layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), - MaxPool((3, 3), stride = 2, pad = 1), - Conv((1, 1), 64 => 64), - Conv((3, 3), 64 => 192; pad = 1), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(192, 64, 96, 128, 16, 32, 32), - _inceptionblock(256, 128, 128, 192, 32, 96, 64), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(480, 192, 96, 208, 16, 48, 64), - _inceptionblock(512, 160, 112, 224, 24, 64, 64), - _inceptionblock(512, 128, 128, 256, 24, 64, 64), - _inceptionblock(512, 112, 144, 288, 32, 64, 64), - _inceptionblock(528, 256, 160, 320, 32, 128, 128), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(832, 256, 160, 320, 32, 128, 128), - _inceptionblock(832, 384, 192, 384, 48, 128, 128)), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dropout(0.4), - Dense(1024, nclasses))) - - return layers + layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), + MaxPool((3, 3), stride = 2, pad = 1), + Conv((1, 1), 64 => 64), + Conv((3, 3), 64 => 192; pad = 1), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(192, 64, 96, 128, 16, 32, 32), + _inceptionblock(256, 128, 128, 192, 32, 96, 64), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(480, 192, 96, 208, 16, 48, 64), + _inceptionblock(512, 160, 112, 224, 24, 64, 64), + _inceptionblock(512, 128, 128, 256, 24, 64, 64), + _inceptionblock(512, 112, 144, 288, 32, 64, 64), + _inceptionblock(528, 256, 160, 320, 32, 128, 128), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(832, 256, 160, 320, 32, 128, 128), + _inceptionblock(832, 384, 192, 384, 48, 128, 128)), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dropout(0.4), + Dense(1024, nclasses))) + return layers end """ @@ -79,14 +74,13 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`) See also [`googlenet`](#). 
""" struct GoogLeNet - layers + layers::Any end function GoogLeNet(; pretrain = false, nclasses = 1000) - layers = googlenet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "GoogLeNet") - - GoogLeNet(layers) + layers = googlenet(nclasses = nclasses) + pretrain && loadpretrain!(layers, "GoogLeNet") + GoogLeNet(layers) end @functor GoogLeNet diff --git a/src/convnets/inception.jl b/src/convnets/inception.jl index ef8ab81ef..2673d1b8e 100644 --- a/src/convnets/inception.jl +++ b/src/convnets/inception.jl @@ -9,20 +9,16 @@ Create an Inception-v3 style-A module - `pool_proj`: the number of output feature maps for the pooling projection """ function inception_a(inplanes, pool_proj) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) - - branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., - conv_bn((5, 5), 48, 64; pad = 2)...) - - branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; pad = 1)...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, pool_proj)...) - - return Parallel(cat_channels, - branch1x1, branch5x5, branch3x3, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) + branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., + conv_bn((5, 5), 48, 64; pad = 2)...) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; pad = 1)...) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, pool_proj)...) + return Parallel(cat_channels, + branch1x1, branch5x5, branch3x3, branch_pool) end """ @@ -35,16 +31,13 @@ Create an Inception-v3 style-B module - `inplanes`: number of input feature maps """ function inception_b(inplanes) - branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride = 2) - - return Parallel(cat_channels, - branch3x3_1, branch3x3_2, branch_pool) + branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; stride = 2)...) + branch_pool = MaxPool((3, 3), stride = 2) + return Parallel(cat_channels, + branch3x3_1, branch3x3_2, branch_pool) end """ @@ -59,23 +52,19 @@ Create an Inception-v3 style-C module - `n`: the "grid size" (kernel size) for the convolution layers """ function inception_c(inplanes, inner_planes, n = 7) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) - - branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) - - branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride=1), - conv_bn((1, 1), inplanes, 192)...) 
- - return Parallel(cat_channels, - branch1x1, branch7x7_1, branch7x7_2, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) + branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) + branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) + return Parallel(cat_channels, + branch1x1, branch7x7_1, branch7x7_2, branch_pool) end """ @@ -88,18 +77,15 @@ Create an Inception-v3 style-D module - `inplanes`: number of input feature maps """ function inception_d(inplanes) - branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((3, 3), 192, 320; stride = 2)...) - - branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((1, 7), 192, 192; pad = (0, 3))..., - conv_bn((7, 1), 192, 192; pad = (3, 0))..., - conv_bn((3, 3), 192, 192; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride=2) - - return Parallel(cat_channels, - branch3x3, branch7x7x3, branch_pool) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((3, 3), 192, 320; stride = 2)...) + branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((1, 7), 192, 192; pad = (0, 3))..., + conv_bn((7, 1), 192, 192; pad = (3, 0))..., + conv_bn((3, 3), 192, 192; stride = 2)...) + branch_pool = MaxPool((3, 3), stride = 2) + return Parallel(cat_channels, + branch3x3, branch7x7x3, branch_pool) end """ @@ -112,30 +98,25 @@ Create an Inception-v3 style-E module - `inplanes`: number of input feature maps """ function inception_e(inplanes) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) - - branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) - branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., - conv_bn((3, 3), 448, 384; pad = 1)...) - branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, 192)...) - - return Parallel(cat_channels, - branch1x1, - Chain(branch3x3_1, - Parallel(cat_channels, - branch3x3_1a, branch3x3_1b)), - - Chain(branch3x3_2, - Parallel(cat_channels, - branch3x3_2a, branch3x3_2b)), - branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) + branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) + branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., + conv_bn((3, 3), 448, 384; pad = 1)...) + branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) 
+ return Parallel(cat_channels, + branch1x1, + Chain(branch3x3_1, + Parallel(cat_channels, + branch3x3_1a, branch3x3_1b)), + Chain(branch3x3_2, + Parallel(cat_channels, + branch3x3_2a, branch3x3_2b)), + branch_pool) end """ @@ -150,30 +131,29 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). `inception3` does not currently support pretrained weights. """ function inception3(; nclasses = 1000) - layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., - conv_bn((3, 3), 32, 32)..., - conv_bn((3, 3), 32, 64; pad = 1)..., - MaxPool((3, 3), stride = 2), - conv_bn((1, 1), 64, 80)..., - conv_bn((3, 3), 80, 192)..., - MaxPool((3, 3), stride = 2), - inception_a(192, 32), - inception_a(256, 64), - inception_a(288, 64), - inception_b(288), - inception_c(768, 128), - inception_c(768, 160), - inception_c(768, 160), - inception_c(768, 192), - inception_d(768), - inception_e(1280), - inception_e(2048)), - Chain(AdaptiveMeanPool((1, 1)), - Dropout(0.2), - MLUtils.flatten, - Dense(2048, nclasses))) - - return layer + layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., + conv_bn((3, 3), 32, 32)..., + conv_bn((3, 3), 32, 64; pad = 1)..., + MaxPool((3, 3), stride = 2), + conv_bn((1, 1), 64, 80)..., + conv_bn((3, 3), 80, 192)..., + MaxPool((3, 3), stride = 2), + inception_a(192, 32), + inception_a(256, 64), + inception_a(288, 64), + inception_b(288), + inception_c(768, 128), + inception_c(768, 160), + inception_c(768, 160), + inception_c(768, 192), + inception_d(768), + inception_e(1280), + inception_e(2048)), + Chain(AdaptiveMeanPool((1, 1)), + Dropout(0.2), + MLUtils.flatten, + Dense(2048, nclasses))) + return layer end """ @@ -190,14 +170,13 @@ See also [`inception3`](#). `Inception3` does not currently support pretrained weights. """ struct Inception3 - layers + layers::Any end function Inception3(; pretrain = false, nclasses = 1000) - layers = inception3(nclasses = nclasses) - pretrain && loadpretrain!(layers, "Inception3") - - Inception3(layers) + layers = inception3(nclasses = nclasses) + pretrain && loadpretrain!(layers, "Inception3") + Inception3(layers) end @functor Inception3 diff --git a/src/convnets/mobilenet.jl b/src/convnets/mobilenet.jl index 2dfd06f8d..fed893142 100644 --- a/src/convnets/mobilenet.jl +++ b/src/convnets/mobilenet.jl @@ -27,37 +27,37 @@ function mobilenetv1(width_mult, config; inchannels = 3, nclasses = 1000, fcsize = 1024) - layers = [] - for (dw, outch, stride, nrepeats) in config - outch = Int(outch * width_mult) - for _ in 1:nrepeats - layer = dw ? depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; - stride = stride, pad = 1) : - conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) - append!(layers, layer) - inchannels = outch + layers = [] + for (dw, outch, stride, nrepeats) in config + outch = Int(outch * width_mult) + for _ in 1:nrepeats + layer = dw ? 
+ depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; + stride = stride, pad = 1) : + conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) + append!(layers, layer) + inchannels = outch + end end - end - - return Chain(Chain(layers), - Chain(GlobalMeanPool(), - MLUtils.flatten, - Dense(inchannels, fcsize, activation), - Dense(fcsize, nclasses))) + return Chain(Chain(layers), + Chain(GlobalMeanPool(), + MLUtils.flatten, + Dense(inchannels, fcsize, activation), + Dense(fcsize, nclasses))) end const mobilenetv1_configs = [ -# dw, c, s, r - (false, 32, 2, 1), - ( true, 64, 1, 1), - ( true, 128, 2, 1), - ( true, 128, 1, 1), - ( true, 256, 2, 1), - ( true, 256, 1, 1), - ( true, 512, 2, 1), - ( true, 512, 1, 5), - ( true, 1024, 2, 1), - ( true, 1024, 1, 1) + # dw, c, s, r + (false, 32, 2, 1), + (true, 64, 1, 1), + (true, 128, 2, 1), + (true, 128, 1, 1), + (true, 256, 2, 1), + (true, 256, 1, 1), + (true, 512, 2, 1), + (true, 512, 1, 5), + (true, 1024, 2, 1), + (true, 1024, 1, 1), ] """ @@ -77,14 +77,13 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. See also [`Metalhead.mobilenetv1`](#). """ struct MobileNetv1 - layers + layers::Any end function MobileNetv1(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv1")) - - return MobileNetv1(layers) + layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv1")) + return MobileNetv1(layers) end @functor MobileNetv1 @@ -95,7 +94,6 @@ backbone(m::MobileNetv1) = m.layers[1] classifier(m::MobileNetv1) = m.layers[2] # MobileNetv2 - """ mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) @@ -115,44 +113,45 @@ Create a MobileNetv2 model. - `nclasses`: The number of output classes """ function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) - # building first layer - inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) - - # building inverted residual blocks - for (t, c, n, s, a) in configs - outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) - for i in 1:n - push!(layers, invertedresidual(3, inplanes, inplanes * t, outplanes, a; - stride = i == 1 ? s : 1)) - inplanes = outplanes + # building first layer + inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) + # building inverted residual blocks + for (t, c, n, s, a) in configs + outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) + for i in 1:n + push!(layers, + invertedresidual(3, inplanes, inplanes * t, outplanes, a; + stride = i == 1 ? s : 1)) + inplanes = outplanes + end end - end - - # building last several layers - outplanes = (width_mult > 1) ? _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : - max_width - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(outplanes, nclasses))) + # building last several layers + outplanes = (width_mult > 1) ? + _round_channels(max_width * width_mult, width_mult == 0.1 ? 
4 : 8) : + max_width + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(outplanes, nclasses))) end # Layer configurations for MobileNetv2 const mobilenetv2_configs = [ -# t, c, n, s, a - (1, 16, 1, 1, relu6), - (6, 24, 2, 2, relu6), - (6, 32, 3, 2, relu6), - (6, 64, 4, 2, relu6), - (6, 96, 3, 1, relu6), - (6, 160, 3, 2, relu6), - (6, 320, 1, 1, relu6) + # t, c, n, s, a + (1, 16, 1, 1, relu6), + (6, 24, 2, 2, relu6), + (6, 32, 3, 2, relu6), + (6, 64, 4, 2, relu6), + (6, 96, 3, 1, relu6), + (6, 160, 3, 2, relu6), + (6, 320, 1, 1, relu6), ] # Model definition for MobileNetv2 struct MobileNetv2 - layers + layers::Any end """ @@ -172,10 +171,9 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. See also [`Metalhead.mobilenetv2`](#). """ function MobileNetv2(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv2")) - - MobileNetv2(layers) + layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv2")) + MobileNetv2(layers) end @functor MobileNetv2 @@ -186,7 +184,6 @@ backbone(m::MobileNetv2) = m.layers[1] classifier(m::MobileNetv2) = m.layers[2] # MobileNetv3 - """ mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) @@ -208,71 +205,70 @@ Create a MobileNetv3 model. - `nclasses`: the number of output classes """ function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) - # building first layer - inplanes = _round_channels(16 * width_mult, 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) - explanes = 0 - # building inverted residual blocks - for (k, t, c, r, a, s) in configs - # inverted residual layers - outplanes = _round_channels(c * width_mult, 8) - explanes = _round_channels(inplanes * t, 8) - push!(layers, invertedresidual(k, inplanes, explanes, outplanes, a; - stride = s, reduction = r)) - inplanes = outplanes - end - - # building last several layers - output_channel = max_width - output_channel = width_mult > 1.0 ? _round_channels(output_channel * width_mult, 8) : output_channel - classifier = Chain(Dense(explanes, output_channel, hardswish), - Dropout(0.2), - Dense(output_channel, nclasses)) - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) + # building first layer + inplanes = _round_channels(16 * width_mult, 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) + explanes = 0 + # building inverted residual blocks + for (k, t, c, r, a, s) in configs + # inverted residual layers + outplanes = _round_channels(c * width_mult, 8) + explanes = _round_channels(inplanes * t, 8) + push!(layers, + invertedresidual(k, inplanes, explanes, outplanes, a; + stride = s, reduction = r)) + inplanes = outplanes + end + # building last several layers + output_channel = max_width + output_channel = width_mult > 1.0 ? 
_round_channels(output_channel * width_mult, 8) : + output_channel + classifier = Chain(Dense(explanes, output_channel, hardswish), + Dropout(0.2), + Dense(output_channel, nclasses)) + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) end # Configurations for small and large mode for MobileNetv3 -mobilenetv3_configs = Dict( - :small => [ - # k, t, c, SE, a, s - (3, 1, 16, 4, relu, 2), - (3, 4.5, 24, nothing, relu, 2), - (3, 3.67, 24, nothing, relu, 1), - (5, 4, 40, 4, hardswish, 2), - (5, 6, 40, 4, hardswish, 1), - (5, 6, 40, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 2), - (5, 6, 96, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 1), - ], - :large => [ - # k, t, c, SE, a, s - (3, 1, 16, nothing, relu, 1), - (3, 4, 24, nothing, relu, 2), - (3, 3, 24, nothing, relu, 1), - (5, 3, 40, 4, relu, 2), - (5, 3, 40, 4, relu, 1), - (5, 3, 40, 4, relu, 1), - (3, 6, 80, nothing, hardswish, 2), - (3, 2.5, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 2), - (5, 6, 160, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 1) - ] -) +mobilenetv3_configs = Dict(:small => [ + # k, t, c, SE, a, s + (3, 1, 16, 4, relu, 2), + (3, 4.5, 24, nothing, relu, 2), + (3, 3.67, 24, nothing, relu, 1), + (5, 4, 40, 4, hardswish, 2), + (5, 6, 40, 4, hardswish, 1), + (5, 6, 40, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 2), + (5, 6, 96, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 1), + ], + :large => [ + # k, t, c, SE, a, s + (3, 1, 16, nothing, relu, 1), + (3, 4, 24, nothing, relu, 2), + (3, 3, 24, nothing, relu, 1), + (5, 3, 40, 4, relu, 2), + (5, 3, 40, 4, relu, 1), + (5, 3, 40, 4, relu, 1), + (3, 6, 80, nothing, hardswish, 2), + (3, 2.5, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 2), + (5, 6, 160, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 1), + ]) # Model definition for MobileNetv3 struct MobileNetv3 - layers + layers::Any end """ @@ -292,13 +288,14 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.mobilenetv3`](#). """ -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, nclasses = 1000) - @assert mode in [:large, :small] "`mode` has to be either :large or :small" - - max_width = (mode == :large) ? 1280 : 1024 - layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) - MobileNetv3(layers) +function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, + nclasses = 1000) + @assert mode in [:large, :small] "`mode` has to be either :large or :small" + max_width = (mode == :large) ? 
1280 : 1024 + layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, + nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) + MobileNetv3(layers) end @functor MobileNetv3 diff --git a/src/convnets/resnet.jl b/src/convnets/resnet.jl index d91d65d6a..54bb5cb35 100644 --- a/src/convnets/resnet.jl +++ b/src/convnets/resnet.jl @@ -11,9 +11,11 @@ Create a basic residual block - `downsample`: set to `true` to downsample the input """ function basicblock(inplanes, outplanes, downsample = false) - stride = downsample ? 2 : 1 - Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, bias = false)...) + stride = downsample ? 2 : 1 + Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, + bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, + bias = false)...) end """ @@ -36,9 +38,11 @@ The original paper uses `stride == [2, 1, 1]` when `downsample == true` instead. """ function bottleneck(inplanes, outplanes, downsample = false; stride = [1, (downsample ? 2 : 1), 1]) - Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, bias = false)..., - conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], bias = false)...) + Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, + bias = false)..., + conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], + bias = false)...) end """ @@ -55,8 +59,9 @@ layer which has a stride of 2. within the residual block - `downsample`: set to `true` to downsample the input """ -bottleneck_v1(inplanes, outplanes, downsample = false) = +function bottleneck_v1(inplanes, outplanes, downsample = false) bottleneck(inplanes, outplanes, downsample; stride = [(downsample ? 
2 : 1), 1, 1]) +end """ resnet(block, residuals::NTuple{2, Any}, connection = addrelu; @@ -78,31 +83,33 @@ Create a ResNet model """ function resnet(block, residuals::AbstractVector{<:NTuple{2, Any}}, connection = addrelu; channel_config, block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 64 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes .* channel_config - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, block(inplanes, outplanes, i != 1), - residuals[i][1](inplanes, outplanes[end], i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes[end] - for _ in 2:nrepeats - push!(layers, Parallel(connection, block(inplanes, outplanes, false), - residuals[i][2](inplanes, outplanes[end], false))) - inplanes = outplanes[end] + inplanes = 64 + baseplanes = 64 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes .* channel_config + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, block(inplanes, outplanes, i != 1), + residuals[i][1](inplanes, outplanes[end], i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes[end] + for _ in 2:nrepeats + push!(layers, + Parallel(connection, block(inplanes, outplanes, false), + residuals[i][2](inplanes, outplanes[end], false))) + inplanes = outplanes[end] + end + # next set of output plane base is doubled + baseplanes *= 2 end - # next set of output plane base is doubled - baseplanes *= 2 - end - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -126,17 +133,14 @@ Create a ResNet model - `nclasses`: the number of output classes """ function resnet(block, shortcut_config::AbstractVector{<:Symbol}, args...; kwargs...) - shortcut_dict = Dict( - :A => (skip_identity, skip_identity), - :B => (skip_projection, skip_identity), - :C => (skip_projection, skip_projection)) - - if any(sc -> !haskey(shortcut_dict,sc),shortcut_config) - error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") - end - - shortcut = [shortcut_dict[sc] for sc in shortcut_config] - resnet(block, shortcut, args...; kwargs...) + shortcut_dict = Dict(:A => (skip_identity, skip_identity), + :B => (skip_projection, skip_identity), + :C => (skip_projection, skip_projection)) + if any(sc -> !haskey(shortcut_dict, sc), shortcut_config) + error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") + end + shortcut = [shortcut_dict[sc] for sc in shortcut_config] + resnet(block, shortcut, args...; kwargs...) end function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs...) @@ -144,14 +148,15 @@ function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs... 
block_config = block_config, kwargs...) end -resnet(block, residuals::NTuple{2}, args...; kwargs...) = resnet(block, [residuals], args...; kwargs...) +function resnet(block, residuals::NTuple{2}, args...; kwargs...) + resnet(block, [residuals], args...; kwargs...) +end -const resnet_config = - Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), - 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), - 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), - 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), - 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) +const resnet_config = Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), + 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), + 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), + 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), + 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) """ ResNet(channel_config, block_config, shortcut_config; @@ -173,19 +178,18 @@ See also [`resnet`](#). - `nclasses`: the number of output classes """ struct ResNet - layers + layers::Any end function ResNet(channel_config, block_config, shortcut_config; block, connection = addrelu, nclasses = 1000) - layers = resnet(block, - shortcut_config, - connection; - channel_config = channel_config, - block_config = block_config, - nclasses = nclasses) - - ResNet(layers) + layers = resnet(block, + shortcut_config, + connection; + channel_config = channel_config, + block_config = block_config, + nclasses = nclasses) + ResNet(layers) end @functor ResNet @@ -238,7 +242,6 @@ resnet50_v1 = ResNet([1, 1, 4], [3, 4, 6, 3], :B; block = Metalhead.bottleneck_v """ function ResNet(depth::Integer = 50; pretrain = false, nclasses = 1000) @assert depth in keys(resnet_config) "`depth` must be one of $(sort(collect(keys(resnet_config))))" - config, block = resnet_config[depth] model = ResNet(config...; block = block, nclasses = nclasses) pretrain && loadpretrain!(model, string("ResNet", depth)) diff --git a/src/convnets/resnext.jl b/src/convnets/resnext.jl index eaa66f98f..41910cb26 100644 --- a/src/convnets/resnext.jl +++ b/src/convnets/resnext.jl @@ -12,12 +12,12 @@ Create a basic residual block as defined in the paper for ResNeXt - `downsample`: set to `true` to downsample the input """ function resnextblock(inplanes, outplanes, cardinality, width, downsample = false) - stride = downsample ? 2 : 1 - hidden_channels = cardinality * width - return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., - conv_bn((3, 3), hidden_channels, hidden_channels; - stride = stride, pad = 1, bias = false, groups = cardinality)..., - conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) + stride = downsample ? 2 : 1 + hidden_channels = cardinality * width + return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., + conv_bn((3, 3), hidden_channels, hidden_channels; + stride = stride, pad = 1, bias = false, groups = cardinality)..., + conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) end """ @@ -35,33 +35,39 @@ Create a ResNeXt model - `block_config`: a list of the number of residual blocks at each stage - `nclasses`: the number of output classes """ -function resnext(cardinality, width, widen_factor = 2, connection = (x, y) -> @. relu(x) + relu(y); +function resnext(cardinality, width, widen_factor = 2, + connection = (x, y) -> @. 
relu(x) + relu(y); block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 128 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes * widen_factor - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, i != 1), - skip_projection(inplanes, outplanes, i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes - for _ in 2:nrepeats - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, false), - skip_identity(inplanes, outplanes, false))) + inplanes = 64 + baseplanes = 128 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes * widen_factor + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, i != 1), + skip_projection(inplanes, outplanes, i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes + for _ in 2:nrepeats + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, false), + skip_identity(inplanes, outplanes, false))) + end + baseplanes = outplanes + # double width after every cluster of blocks + width *= widen_factor end - baseplanes = outplanes - # double width after every cluster of blocks - width *= widen_factor - end - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -77,12 +83,12 @@ Create a ResNeXt model - `nclasses`: the number of output classes """ struct ResNeXt - layers + layers::Any end function ResNeXt(cardinality, width; block_config, nclasses = 1000) - layers = resnext(cardinality, width; block_config, nclasses) - ResNeXt(layers) + layers = resnext(cardinality, width; block_config, nclasses) + ResNeXt(layers) end @functor ResNeXt @@ -92,11 +98,9 @@ end backbone(m::ResNeXt) = m.layers[1] classifier(m::ResNeXt) = m.layers[2] -const resnext_config = Dict( - 50 => (3, 4, 6, 3), - 101 => (3, 4, 23, 3), - 152 => (3, 8, 36, 3) -) +const resnext_config = Dict(50 => (3, 4, 6, 3), + 101 => (3, 4, 23, 3), + 152 => (3, 8, 36, 3)) """ ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) @@ -110,10 +114,10 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.resnext`](#). 
""" -function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) - @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" - - model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) - pretrain && loadpretrain!(model, string("ResNeXt", config)) - model -end \ No newline at end of file +function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, + nclasses = 1000) + @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" + model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) + pretrain && loadpretrain!(model, string("ResNeXt", config)) + model +end diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index 169ad2e86..209dfb9a2 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -11,14 +11,14 @@ Create a fire module - `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution """ function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) - branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) - branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) - branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) + branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) + branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) + branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) - return Chain(branch_1, - Parallel(cat_channels, - branch_2, - branch_3)) + return Chain(branch_1, + Parallel(cat_channels, + branch_2, + branch_3)) end """ @@ -28,24 +28,24 @@ Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). """ function squeezenet() - layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), - MaxPool((3, 3), stride = 2), - fire(64, 16, 64, 64), - fire(128, 16, 64, 64), - MaxPool((3, 3), stride = 2), - fire(128, 32, 128, 128), - fire(256, 32, 128, 128), - MaxPool((3, 3), stride = 2), - fire(256, 48, 192, 192), - fire(384, 48, 192, 192), - fire(384, 64, 256, 256), - fire(512, 64, 256, 256), - Dropout(0.5), - Conv((1, 1), 512 => 1000, relu)), - AdaptiveMeanPool((1, 1)), - MLUtils.flatten) + layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), + MaxPool((3, 3), stride = 2), + fire(64, 16, 64, 64), + fire(128, 16, 64, 64), + MaxPool((3, 3), stride = 2), + fire(128, 32, 128, 128), + fire(256, 32, 128, 128), + MaxPool((3, 3), stride = 2), + fire(256, 48, 192, 192), + fire(384, 48, 192, 192), + fire(384, 64, 256, 256), + fire(512, 64, 256, 256), + Dropout(0.5), + Conv((1, 1), 512 => 1000, relu)), + AdaptiveMeanPool((1, 1)), + MLUtils.flatten) - return layers + return layers end """ @@ -61,14 +61,13 @@ Set `pretrain=true` to load the model with pre-trained weights for ImageNet. See also [`squeezenet`](#). 
""" struct SqueezeNet - layers + layers::Any end function SqueezeNet(; pretrain = false) - layers = squeezenet() - pretrain && loadpretrain!(layers, "SqueezeNet") - - SqueezeNet(layers) + layers = squeezenet() + pretrain && loadpretrain!(layers, "SqueezeNet") + SqueezeNet(layers) end @functor SqueezeNet diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index bdca0d9ee..2f8777297 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -11,18 +11,18 @@ A VGG block of convolution layers - `batchnorm`: set to `true` to include batch normalization after each convolution """ function vgg_block(ifilters, ofilters, depth, batchnorm) - k = (3,3) - p = (1,1) - layers = [] - for _ in 1:depth - if batchnorm - append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) - else - push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + k = (3, 3) + p = (1, 1) + layers = [] + for _ in 1:depth + if batchnorm + append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) + else + push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + end + ifilters = ofilters end - ifilters = ofilters - end - return layers + return layers end """ @@ -38,14 +38,14 @@ Create VGG convolution layers - `inchannels`: number of input channels """ function vgg_convolutional_layers(config, batchnorm, inchannels) - layers = [] - ifilters = inchannels - for c in config - append!(layers, vgg_block(ifilters, c..., batchnorm)) - push!(layers, MaxPool((2,2), stride=2)) - ifilters, _ = c - end - return layers + layers = [] + ifilters = inchannels + for c in config + append!(layers, vgg_block(ifilters, c..., batchnorm)) + push!(layers, MaxPool((2, 2), stride = 2)) + ifilters, _ = c + end + return layers end """ @@ -62,12 +62,12 @@ Create VGG classifier (fully connected) layers - `dropout`: the dropout level between each fully connected layer """ function vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(MLUtils.flatten, - Dense(Int(prod(imsize)), fcsize, relu), - Dropout(dropout), - Dense(fcsize, fcsize, relu), - Dropout(dropout), - Dense(fcsize, nclasses)) + return Chain(MLUtils.flatten, + Dense(Int(prod(imsize)), fcsize, relu), + Dropout(dropout), + Dense(fcsize, fcsize, relu), + Dropout(dropout), + Dense(fcsize, nclasses)) end """ @@ -88,16 +88,16 @@ Create a VGG model - `dropout`: dropout level between fully connected layers """ function vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - conv = vgg_convolutional_layers(config, batchnorm, inchannels) - imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] - class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(Chain(conv), class) + conv = vgg_convolutional_layers(config, batchnorm, inchannels) + imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] + class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) + return Chain(Chain(conv), class) end -const vgg_conv_config = Dict(:A => [(64,1), (128,1), (256,2), (512,2), (512,2)], - :B => [(64,2), (128,2), (256,2), (512,2), (512,2)], - :D => [(64,2), (128,2), (256,3), (512,3), (512,3)], - :E => [(64,2), (128,2), (256,4), (512,4), (512,4)]) +const vgg_conv_config = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512, 2)], + :B => [(64, 2), (128, 2), (256, 2), (512, 2), (512, 2)], + :D => [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)], + :E => [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]) const vgg_config = Dict(11 => :A, 13 => :B, @@ -105,7 +105,7 @@ const vgg_config = 
Dict(11 => :A, 19 => :E) struct VGG - layers + layers::Any end """ @@ -124,14 +124,14 @@ Construct a VGG model with the specified input image size. Typically, the image """ function VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - layers = vgg(imsize; config = config, - inchannels = inchannels, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = fcsize, - dropout = dropout) - - VGG(layers) + layers = vgg(imsize; config = config, + inchannels = inchannels, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = fcsize, + dropout = dropout) + + VGG(layers) end @functor VGG @@ -155,21 +155,19 @@ See also [`VGG`](#). - `pretrain`: set to `true` to load pre-trained model weights for ImageNet """ function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000) - @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" - - model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], - inchannels = 3, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = 4096, - dropout = 0.5) - - if pretrain && !batchnorm - loadpretrain!(model, string("VGG", depth)) - elseif pretrain - loadpretrain!(model, "VGG$(depth)-BN)") - end - model + @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" + model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], + inchannels = 3, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = 4096, + dropout = 0.5) + if pretrain && !batchnorm + loadpretrain!(model, string("VGG", depth)) + elseif pretrain + loadpretrain!(model, "VGG$(depth)-BN)") + end + model end # deprecations diff --git a/src/layers/attention.jl b/src/layers/attention.jl index 10baf73e9..917b58c88 100644 --- a/src/layers/attention.jl +++ b/src/layers/attention.jl @@ -10,10 +10,10 @@ Multi-head self-attention layer. - `projection`: projection layer to be used after self-attention """ struct MHAttention{P, Q, R} - nheads::Int - qkv_layer::P - attn_drop::Q - projection::R + nheads::Int + qkv_layer::P + attn_drop::Q + projection::R end """ @@ -28,31 +28,31 @@ Multi-head self-attention layer. - `attn_drop`: dropout rate after the self-attention layer - `proj_drop`: dropout rate after the projection layer """ -function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_drop = 0., proj_drop = 0.) 
- @assert planes % nheads == 0 "planes should be divisible by nheads" - qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) - attn_drop = Dropout(attn_drop) - proj = Chain(Dense(planes, planes), Dropout(proj_drop)) - - MHAttention(nheads, qkv_layer, attn_drop, proj) +function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, + attn_drop = 0.0, proj_drop = 0.0) + @assert planes % nheads==0 "planes should be divisible by nheads" + qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) + attn_drop = Dropout(attn_drop) + proj = Chain(Dense(planes, planes), Dropout(proj_drop)) + MHAttention(nheads, qkv_layer, attn_drop, proj) end @functor MHAttention function (m::MHAttention)(x::AbstractArray{T, 3}) where {T} - nfeatures, seq_len, batch_size = size(x) - x_reshaped = reshape(x, nfeatures, seq_len * batch_size) - qkv = m.qkv_layer(x_reshaped) - qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) - query, key, value = chunk(qkv_reshaped, 3; dims = 4) - scale = convert(T, sqrt(size(query, 1) / m.nheads)) - key_reshaped = reshape( - permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, seq_len * batch_size - ) - query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) - value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - pre_projection = reshape(batched_mul(attention, value_reshaped), (nfeatures, seq_len, batch_size)) - y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) - return reshape(y, :, seq_len, batch_size) + nfeatures, seq_len, batch_size = size(x) + x_reshaped = reshape(x, nfeatures, seq_len * batch_size) + qkv = m.qkv_layer(x_reshaped) + qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) + query, key, value = chunk(qkv_reshaped, 3; dims = 4) + scale = convert(T, sqrt(size(query, 1) / m.nheads)) + key_reshaped = reshape(permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, + seq_len * batch_size) + query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) + value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + pre_projection = reshape(batched_mul(attention, value_reshaped), + (nfeatures, seq_len, batch_size)) + y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) + return reshape(y, :, seq_len, batch_size) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ca30df8a4..8455a257e 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -24,28 +24,26 @@ Create a convolution + batch normalization pair with activation. """ function conv_bn(kernelsize, inplanes, outplanes, activation = relu; rev = false, preact = false, - initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1f-5, momentum = 1f-1, + initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1.0f-5, momentum = 1.0f-1, kwargs...) - layers = [] - - if rev - activations = (conv = activation, bn = identity) - bnplanes = inplanes - else - activations = (conv = identity, bn = activation) - bnplanes = outplanes - end - - if preact - rev ? 
throw(ArgumentError("preact and rev cannot be set at the same time")) : - activations = (conv = activation, bn = identity) - end - - push!(layers, Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) - push!(layers, BatchNorm(Int(bnplanes), activations.bn; - initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) - - return rev ? reverse(layers) : layers + layers = [] + if rev + activations = (conv = activation, bn = identity) + bnplanes = inplanes + else + activations = (conv = identity, bn = activation) + bnplanes = outplanes + end + if preact + rev ? throw(ArgumentError("preact and rev cannot be set at the same time")) : + activations = (conv = activation, bn = identity) + end + push!(layers, + Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) + push!(layers, + BatchNorm(Int(bnplanes), activations.bn; + initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) + return rev ? reverse(layers) : layers end """ @@ -77,18 +75,19 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). - `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) - `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) """ -depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; - rev = false, - initβ = Flux.zeros32, initγ = Flux.ones32, - ϵ = 1f-5, momentum = 1f-1, - stride = 1, kwargs...) = - vcat(conv_bn(kernelsize, inplanes, inplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum, - stride = stride, groups = Int(inplanes), kwargs...), - conv_bn((1, 1), inplanes, outplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum)) +function depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; + rev = false, + initβ = Flux.zeros32, initγ = Flux.ones32, + ϵ = 1.0f-5, momentum = 1.0f-1, + stride = 1, kwargs...) + vcat(conv_bn(kernelsize, inplanes, inplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum, + stride = stride, groups = Int(inplanes), kwargs...), + conv_bn((1, 1), inplanes, outplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum)) +end """ skip_projection(inplanes, outplanes, downsample = false) @@ -101,9 +100,11 @@ Create a skip projection - `outplanes`: the number of output feature maps - `downsample`: set to `true` to downsample the input """ -skip_projection(inplanes, outplanes, downsample = false) = downsample ? - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +function skip_projection(inplanes, outplanes, downsample = false) + downsample ? + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +end # array -> PaddedView(0, array, outplanes) for zero padding arrays """ @@ -118,15 +119,16 @@ Create a identity projection - `downsample`: this argument is ignored but it is needed for compatibility with [`resnet`](#). 
""" function skip_identity(inplanes, outplanes) - if outplanes > inplanes - return Chain(MaxPool((1, 1), stride = 2), - y -> cat(y, zeros(eltype(y), - size(y, 1), - size(y, 2), - outplanes - inplanes, size(y, 4)); dims = 3)) - else - return identity - end + if outplanes > inplanes + return Chain(MaxPool((1, 1), stride = 2), + y -> cat(y, + zeros(eltype(y), + size(y, 1), + size(y, 2), + outplanes - inplanes, size(y, 4)); dims = 3)) + else + return identity + end end skip_identity(inplanes, outplanes, downsample) = skip_identity(inplanes, outplanes) @@ -142,10 +144,11 @@ Squeeze and excitation layer used by MobileNet variants (must be >= 1) """ function squeeze_excite(channels, reduction = 4) - @assert (reduction >= 1) "`reduction` must be >= 1" - SkipConnection(Chain(AdaptiveMeanPool((1, 1)), - conv_bn((1, 1), channels, channels ÷ reduction, relu; bias = false)..., - conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) + @assert (reduction>=1) "`reduction` must be >= 1" + SkipConnection(Chain(AdaptiveMeanPool((1, 1)), + conv_bn((1, 1), channels, channels ÷ reduction, relu; + bias = false)..., + conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) end """ @@ -166,21 +169,22 @@ Create a basic inverted residual block for MobileNet variants in a squeeze and excite layer (see [`squeeze_excite`](#)). Must be >= 1 or `nothing` for no squeeze and excite layer. """ -function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation = relu; +function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, + activation = relu; stride, reduction = nothing) - @assert stride in [1, 2] "`stride` has to be 1 or 2" - - pad = @. (kernel_size - 1) ÷ 2 - conv1 = (inplanes == hidden_planes) ? identity : Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) - selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) - - invres = Chain(conv1, - conv_bn(kernel_size, hidden_planes, hidden_planes, activation; - bias = false, stride, pad = pad, groups = hidden_planes)..., - selayer, - conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) - - (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres + @assert stride in [1, 2] "`stride` has to be 1 or 2" + pad = @. (kernel_size - 1) ÷ 2 + conv1 = (inplanes == hidden_planes) ? identity : + Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) + selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) + invres = Chain(conv1, + conv_bn(kernel_size, hidden_planes, hidden_planes, activation; + bias = false, stride, pad = pad, groups = hidden_planes)..., + selayer, + conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) + + (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres +end +function invertedresidual(kernel_size::Integer, args...; kwargs...) + invertedresidual((kernel_size, kernel_size), args...; kwargs...) end -invertedresidual(kernel_size::Integer, args...; kwargs...) = - invertedresidual((kernel_size, kernel_size), args...; kwargs...) diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index fb6bc6e4d..ad4737fb2 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -20,16 +20,13 @@ patches. 
function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) - - im_height, im_width = imsize - patch_height, patch_width = patch_size - - @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) - "Image dimensions must be divisible by the patch size." - - return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), - flatten ? _flatten_spatial : identity, - norm_layer(embedplanes)) + im_height, im_width = imsize + patch_height, patch_width = patch_size + @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) + "Image dimensions must be divisible by the patch size." + return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), + flatten ? _flatten_spatial : identity, + norm_layer(embedplanes)) end """ @@ -38,11 +35,13 @@ end Positional embedding layer used by many vision transformer-like models. """ struct ViPosEmbedding{T} - vectors::T + vectors::T end -ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) = - ViPosEmbedding(init((embedsize, npatches))) +function ViPosEmbedding(embedsize::Integer, npatches::Integer; + init = (dims::Dims{2}) -> rand(Float32, dims)) + ViPosEmbedding(init((embedsize, npatches))) +end (p::ViPosEmbedding)(x) = x .+ p.vectors @@ -54,14 +53,14 @@ ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> Appends class tokens to an input with embedding dimension `dim` for use in many vision transformer models. """ struct ClassTokens{T} - token::T + token::T end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T} - tokens = m.token .* fill(one(T), (1, 1, size(x, 3))) - return hcat(tokens, x) + tokens = m.token .* fill(one(T), (1, 1, size(x, 3))) + return hcat(tokens, x) end @functor ClassTokens diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index ca8f38f97..f14ba8a8c 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -11,10 +11,10 @@ Feedforward block used in many MLPMixer-like and vision-transformer models. - `dropout`: Dropout rate. - `activation`: Activation function to use. """ -function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; - dropout = 0., activation = gelu) - Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), - Dense(hidden_planes, outplanes), Dropout(dropout)) +function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; + dropout = 0.0, activation = gelu) + Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), + Dense(hidden_planes, outplanes), Dropout(dropout)) end """ @@ -33,12 +33,12 @@ Feedforward block based on the implementation in the paper "Pay Attention to MLP - `activation`: Activation function to use. 
""" function gated_mlp_block(gate_layer, inplanes::Integer, hidden_planes::Integer, - outplanes::Integer = inplanes; dropout = 0., activation = gelu) - @assert hidden_planes % 2 == 0 "`hidden_planes` must be even for gated MLP" - return Chain(Dense(inplanes, hidden_planes, activation), - Dropout(dropout), - gate_layer(hidden_planes), - Dense(hidden_planes ÷ 2, outplanes), - Dropout(dropout)) + outplanes::Integer = inplanes; dropout = 0.0, activation = gelu) + @assert hidden_planes % 2==0 "`hidden_planes` must be even for gated MLP" + return Chain(Dense(inplanes, hidden_planes, activation), + Dropout(dropout), + gate_layer(hidden_planes), + Dense(hidden_planes ÷ 2, outplanes), + Dropout(dropout)) end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index a7bce3e6c..42405b563 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -12,16 +12,16 @@ Note that this is specifically for inputs with 4 dimensions in the format (H, W, C, N) where H, W are the height and width of the input, C is the number of channels, and N is the batch size. """ -struct ChannelLayerNorm{D,T} - diag::D - ϵ::T +struct ChannelLayerNorm{D, T} + diag::D + ϵ::T end @functor ChannelLayerNorm (m::ChannelLayerNorm)(x) = m.diag(MLUtils.normalise(x, dims = ndims(x) - 1, ϵ = m.ϵ)) -function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1f-5) - diag = Flux.Scale(1, 1, sz, λ) - return ChannelLayerNorm(diag, ϵ) +function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1.0f-5) + diag = Flux.Scale(1, 1, sz, λ) + return ChannelLayerNorm(diag, ϵ) end diff --git a/src/layers/others.jl b/src/layers/others.jl index 366b273e4..249cacd0e 100644 --- a/src/layers/others.jl +++ b/src/layers/others.jl @@ -8,8 +8,9 @@ Creates a `Flux.Scale` layer that performs "`LayerScale`" - `planes`: Size of channel dimension in the input. - `λ`: initialisation value for the learnable diagonal matrix. """ -LayerScale(planes::Integer, λ) = +function LayerScale(planes::Integer, λ) λ > 0 ? Flux.Scale(fill(Float32(λ), planes), false) : identity +end """ DropPath(p) @@ -20,4 +21,4 @@ Implements Stochastic Depth - equivalent to `Dropout(p; dims = 4)` when `p` ≥ # Arguments - `p`: rate of Stochastic Depth. """ -DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity \ No newline at end of file +DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity diff --git a/src/other/mlpmixer.jl b/src/other/mlpmixer.jl index 880486dc2..a88118060 100644 --- a/src/other/mlpmixer.jl +++ b/src/other/mlpmixer.jl @@ -15,17 +15,17 @@ Creates a feedforward block for the MLPMixer architecture. 
- `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks """ -function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu) - tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] - return Chain(SkipConnection(Chain(LayerNorm(planes), - swapdims((2, 1, 3)), - mlp_layer(npatches, tokenplanes; activation, dropout), - swapdims((2, 1, 3)), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(LayerNorm(planes), - mlp_layer(planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +)) +function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, + dropout = 0.0, drop_path_rate = 0.0, activation = gelu) + tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] + return Chain(SkipConnection(Chain(LayerNorm(planes), + swapdims((2, 1, 3)), + mlp_layer(npatches, tokenplanes; activation, dropout), + swapdims((2, 1, 3)), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(LayerNorm(planes), + mlp_layer(planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +)) end """ @@ -50,27 +50,30 @@ Creates a model with the MLPMixer architecture. - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if not specified. """ -function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, norm_layer = LayerNorm, - patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., +function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, + norm_layer = LayerNorm, + patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, depth = 12, nclasses = 1000, kwargs...) - npatches = prod(imsize .÷ patch_size) - dp_rates = LinRange{Float32}(0., drop_path_rate, depth) - layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], kwargs...) - for i in 1:depth])) - - classification_head = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) - return Chain(layers, classification_head) + npatches = prod(imsize .÷ patch_size) + dp_rates = LinRange{Float32}(0.0, drop_path_rate, depth) + layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], + kwargs...) + for i in 1:depth])) + + classification_head = Chain(norm_layer(embedplanes), seconddimmean, + Dense(embedplanes, nclasses)) + return Chain(layers, classification_head) end # Configurations for MLPMixer models -mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), - :base => Dict(:depth => 12, :planes => 768), +mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), + :base => Dict(:depth => 12, :planes => 768), :large => Dict(:depth => 24, :planes => 1024), - :huge => Dict(:depth => 32, :planes => 1280)) + :huge => Dict(:depth => 32, :planes => 1280)) struct MLPMixer - layers + layers::Any end """ @@ -90,12 +93,13 @@ Creates a model with the MLPMixer architecture. See also [`Metalhead.mlpmixer`](#). 
""" function MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, nclasses) - MLPMixer(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, + nclasses) + MLPMixer(layers) end @functor MLPMixer @@ -124,21 +128,22 @@ Creates a block for the ResMixer architecture. - `λ`: initialisation constant for the LayerScale """ function resmixerblock(planes, npatches; mlp_ratio = 4.0, mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu, λ = 1e-4) -return Chain(SkipConnection(Chain(Flux.Scale(planes), - swapdims((2, 1, 3)), - Dense(npatches, npatches), - swapdims((2, 1, 3)), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(Flux.Scale(planes), - mlp_layer(planes, Int(mlp_ratio * planes); dropout, activation), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +)) + dropout = 0.0, drop_path_rate = 0.0, activation = gelu, λ = 1e-4) + return Chain(SkipConnection(Chain(Flux.Scale(planes), + swapdims((2, 1, 3)), + Dense(npatches, npatches), + swapdims((2, 1, 3)), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(Flux.Scale(planes), + mlp_layer(planes, Int(mlp_ratio * planes); dropout, + activation), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +)) end struct ResMLP - layers + layers::Any end """ @@ -158,13 +163,13 @@ Creates a model with the ResMLP architecture. See also [`Metalhead.mlpmixer`](#). """ function ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - drop_path_rate, depth, nclasses) - ResMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, + drop_path_rate, depth, nclasses) + ResMLP(layers) end @functor ResMLP @@ -185,8 +190,8 @@ Creates a spatial gating unit as described in the gMLP paper. - `proj`: the projection layer to use """ struct SpatialGatingUnit{T, F} - norm::T - proj::F + norm::T + proj::F end """ @@ -201,19 +206,19 @@ Creates a spatial gating unit as described in the gMLP paper. 
- `norm_layer`: the normalisation layer to use """ function SpatialGatingUnit(planes::Integer, npatches::Integer; norm_layer = LayerNorm) - gateplanes = planes ÷ 2 - norm = norm_layer(gateplanes) - proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) - return SpatialGatingUnit(norm, proj) + gateplanes = planes ÷ 2 + norm = norm_layer(gateplanes) + proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) + return SpatialGatingUnit(norm, proj) end @functor SpatialGatingUnit function (m::SpatialGatingUnit)(x) - u, v = chunk(x, 2; dims = 1) - v = m.norm(v) - v = m.proj(permutedims(v, (2, 1, 3))) - return u .* permutedims(v, (2, 1, 3)) + u, v = chunk(x, 2; dims = 1) + v = m.norm(v) + v = m.proj(permutedims(v, (2, 1, 3))) + return u .* permutedims(v, (2, 1, 3)) end """ @@ -235,17 +240,18 @@ Creates a feedforward block based on the gMLP model architecture described in th - `activation`: the activation function to use in the MLP blocks """ function spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = LayerNorm, - mlp_layer = gated_mlp_block, dropout = 0., drop_path_rate = 0., + mlp_layer = gated_mlp_block, dropout = 0.0, + drop_path_rate = 0.0, activation = gelu) - channelplanes = Int(mlp_ratio * planes) - sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) - return SkipConnection(Chain(norm_layer(planes), - mlp_layer(sgu, planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +) + channelplanes = Int(mlp_ratio * planes) + sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) + return SkipConnection(Chain(norm_layer(planes), + mlp_layer(sgu, planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +) end struct gMLP - layers + layers::Any end """ @@ -265,14 +271,13 @@ Creates a model with the gMLP architecture. See also [`Metalhead.mlpmixer`](#). """ function gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, - patch_size, embedplanes, drop_path_rate, depth, nclasses) - - gMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, + patch_size, embedplanes, drop_path_rate, depth, nclasses) + gMLP(layers) end @functor gMLP diff --git a/src/pretrain.jl b/src/pretrain.jl index 97ab7398e..24e6d176d 100644 --- a/src/pretrain.jl +++ b/src/pretrain.jl @@ -4,17 +4,17 @@ Load the pre-trained weights for `model` using the stored artifacts. 
""" function weights(model) - try - path = joinpath(@artifact_str(model), "$model.bson") - artifact = BSON.load(path, @__MODULE__) - if haskey(artifact, :model) - return artifact[:model] - else - throw(ArgumentError("No pre-trained weights available for $model.")) + try + path = joinpath(@artifact_str(model), "$model.bson") + artifact = BSON.load(path, @__MODULE__) + if haskey(artifact, :model) + return artifact[:model] + else + throw(ArgumentError("No pre-trained weights available for $model.")) + end + catch e + throw(ArgumentError("No pre-trained weights available for $model.")) end - catch e - throw(ArgumentError("No pre-trained weights available for $model.")) - end end """ diff --git a/src/utilities.jl b/src/utilities.jl index 39dbdd3b2..6adc1ec87 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -4,9 +4,9 @@ seconddimmean(x) = dropdims(mean(x, dims = 2); dims = 2) # utility function for making sure that all layers have a channel size divisible by 8 # used by MobileNet variants function _round_channels(channels, divisor, min_value = divisor) - new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) - # Make sure that round down does not go down by more than 10% - return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels + new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) + # Make sure that round down does not go down by more than 10% + return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels end """ @@ -47,11 +47,11 @@ swapdims(perm) = Base.Fix2(permutedims, perm) # Utility function for pretty printing large models function _maybe_big_show(io, model) - if isdefined(Flux, :_big_show) - if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL - Flux._big_show(io, model) - else - show(io, model) + if isdefined(Flux, :_big_show) + if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL + Flux._big_show(io, model) + else + show(io, model) + end end - end end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 53932dee1..c9f6082eb 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -11,13 +11,15 @@ Transformer as used in the base ViT architecture. - `mlp_ratio`: ratio of MLP layers to the number of input channels - `dropout`: dropout rate """ -function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) - layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; attn_drop = dropout, - proj_drop = dropout)), +), - SkipConnection(prenorm(planes, mlp_block(planes, floor(Int, mlp_ratio * planes); - dropout)), +)) - for _ in 1:depth] - Chain(layers) +function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.0) + layers = [Chain(SkipConnection(prenorm(planes, + MHAttention(planes, nheads; attn_drop = dropout, + proj_drop = dropout)), +), + SkipConnection(prenorm(planes, + mlp_block(planes, floor(Int, mlp_ratio * planes); + dropout)), +)) + for _ in 1:depth] + Chain(layers) end """ @@ -44,17 +46,16 @@ Creates a Vision Transformer (ViT) model. 
function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout = 0.1, emb_dropout = 0.1, pool = :class, nclasses = 1000) - - @assert pool in [:class, :mean] - "Pool type must be either :class (class token) or :mean (mean pooling)" - npatches = prod(imsize .÷ patch_size) - return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - ClassTokens(embedplanes), - ViPosEmbedding(embedplanes, npatches + 1), - Dropout(emb_dropout), - transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), - (pool == :class) ? x -> selectdim(x, 2, 1) : seconddimmean), - Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) + @assert pool in [:class, :mean] + "Pool type must be either :class (class token) or :mean (mean pooling)" + npatches = prod(imsize .÷ patch_size) + return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + ClassTokens(embedplanes), + ViPosEmbedding(embedplanes, npatches + 1), + Dropout(emb_dropout), + transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), + (pool == :class) ? x -> selectdim(x, 2, 1) : seconddimmean), + Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), @@ -62,8 +63,10 @@ vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), :base => (depth = 12, embedplanes = 768, nheads = 12), :large => (depth = 24, embedplanes = 1024, nheads = 16), :huge => (depth = 32, embedplanes = 1280, nheads = 16), - :giant => (depth = 40, embedplanes = 1408, nheads = 16, mlp_ratio = 48/11), - :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, mlp_ratio = 64/13)) + :giant => (depth = 40, embedplanes = 1408, nheads = 16, + mlp_ratio = 48 / 11), + :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, + mlp_ratio = 64 / 13)) """ ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels = 3, @@ -83,16 +86,16 @@ Creates a Vision Transformer (ViT) model. See also [`Metalhead.vit`](#). """ struct ViT - layers + layers::Any end function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), inchannels = 3, patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) - @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" - kwargs = vit_configs[mode] - layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) + @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" + kwargs = vit_configs[mode] + layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) 
- ViT(layers) + ViT(layers) end (m::ViT)(x) = m.layers(x) diff --git a/test/convnets.jl b/test/convnets.jl index 3540c3e9f..f62ecc3fd 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -5,202 +5,192 @@ using Flux PRETRAINED_MODELS = [] @testset "AlexNet" begin - model = AlexNet() - @test size(model(x_256)) == (1000, 1) - @test_throws ArgumentError AlexNet(pretrain = true) - @test gradtest(model, x_256) + model = AlexNet() + @test size(model(x_256)) == (1000, 1) + @test_throws ArgumentError AlexNet(pretrain = true) + @test gradtest(model, x_256) end GC.safepoint() GC.gc() -@testset "VGG" begin - @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] +@testset "VGG" begin @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], + bn in [true, false] + m = VGG(sz, batchnorm = bn) @test size(m(x_224)) == (1000, 1) if (VGG, sz, bn) in PRETRAINED_MODELS - @test (VGG(sz, batchnorm = bn, pretrain = true); true) + @test (VGG(sz, batchnorm = bn, pretrain = true); true) else - @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() @testset "ResNet" begin - @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] - m = ResNet(sz) - - @test size(m(x_256)) == (1000, 1) - if (ResNet, sz) in PRETRAINED_MODELS - @test (ResNet(sz, pretrain = true); true) - else - @test_throws ArgumentError ResNet(sz, pretrain = true) + @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] + m = ResNet(sz) + + @test size(m(x_256)) == (1000, 1) + if (ResNet, sz) in PRETRAINED_MODELS + @test (ResNet(sz, pretrain = true); true) + else + @test_throws ArgumentError ResNet(sz, pretrain = true) + end + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() - end - @testset "Shortcut C" begin - m = Metalhead.resnet(Metalhead.basicblock, :C; - channel_config = [1, 1], - block_config = [2, 2, 2, 2]) + @testset "Shortcut C" begin + m = Metalhead.resnet(Metalhead.basicblock, :C; + channel_config = [1, 1], + block_config = [2, 2, 2, 2]) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - end + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + end end GC.safepoint() GC.gc() -@testset "ResNeXt" begin - @testset for depth in [50, 101, 152] +@testset "ResNeXt" begin @testset for depth in [50, 101, 152] m = ResNeXt(depth) @test size(m(x_224)) == (1000, 1) if ResNeXt in PRETRAINED_MODELS - @test (ResNeXt(depth, pretrain = true); true) + @test (ResNeXt(depth, pretrain = true); true) else - @test_throws ArgumentError ResNeXt(depth, pretrain = true) + @test_throws ArgumentError ResNeXt(depth, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() @testset "GoogLeNet" begin - m = GoogLeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (GoogLeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = GoogLeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (GoogLeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset "Inception3" begin - m = Inception3() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError Inception3(pretrain = true) - @test gradtest(m, x_224) + m = Inception3() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError Inception3(pretrain = true) + @test gradtest(m, x_224) end 
GC.safepoint() GC.gc() @testset "SqueezeNet" begin - m = SqueezeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (SqueezeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = SqueezeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (SqueezeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() -@testset "DenseNet" begin - @testset for sz in [121, 161, 169, 201] +@testset "DenseNet" begin @testset for sz in [121, 161, 169, 201] m = DenseNet(sz) @test size(m(x_224)) == (1000, 1) if (DenseNet, sz) in PRETRAINED_MODELS - @test (DenseNet(sz, pretrain = true); true) + @test (DenseNet(sz, pretrain = true); true) else - @test_throws ArgumentError DenseNet(sz, pretrain = true) + @test_throws ArgumentError DenseNet(sz, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() -@testset "MobileNet" verbose = true begin - @testset "MobileNetv1" begin - m = MobileNetv1() - - @test size(m(x_224)) == (1000, 1) - if MobileNetv1 in PRETRAINED_MODELS - @test (MobileNetv1(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv1(pretrain = true) +@testset "MobileNet" verbose=true begin + @testset "MobileNetv1" begin + m = MobileNetv1() + + @test size(m(x_224)) == (1000, 1) + if MobileNetv1 in PRETRAINED_MODELS + @test (MobileNetv1(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv1(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - GC.safepoint() - GC.gc() + GC.safepoint() + GC.gc() - @testset "MobileNetv2" begin - m = MobileNetv2() + @testset "MobileNetv2" begin + m = MobileNetv2() - @test size(m(x_224)) == (1000, 1) - if MobileNetv2 in PRETRAINED_MODELS - @test (MobileNetv2(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv2(pretrain = true) + @test size(m(x_224)) == (1000, 1) + if MobileNetv2 in PRETRAINED_MODELS + @test (MobileNetv2(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv2(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - - GC.safepoint() - GC.gc() - - @testset "MobileNetv3" verbose = true begin - @testset for mode in [:small, :large] - m = MobileNetv3(mode) - - @test size(m(x_224)) == (1000, 1) - if MobileNetv3 in PRETRAINED_MODELS - @test (MobileNetv3(mode; pretrain = true); true) - else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) - end - @test gradtest(m, x_224) - end - end + + GC.safepoint() + GC.gc() + + @testset "MobileNetv3" verbose=true begin @testset for mode in [:small, :large] + m = MobileNetv3(mode) + + @test size(m(x_224)) == (1000, 1) + if MobileNetv3 in PRETRAINED_MODELS + @test (MobileNetv3(mode; pretrain = true); true) + else + @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + end + @test gradtest(m, x_224) + end end end GC.safepoint() GC.gc() -@testset "ConvNeXt" verbose = true begin - @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] +@testset "ConvNeXt" verbose=true begin @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) + m = ConvNeXt(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end GC.safepoint() GC.gc() -@testset "ConvMixer" verbose = true begin - @testset for mode in [:small, 
:base, :large] +@testset "ConvMixer" verbose=true begin @testset for mode in [:small, :base, :large] m = ConvMixer(mode) @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end diff --git a/test/other.jl b/test/other.jl index 0162bc4bc..db0bf223c 100644 --- a/test/other.jl +++ b/test/other.jl @@ -1,38 +1,32 @@ using Metalhead, Test using Flux -@testset "MLPMixer" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "MLPMixer" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = MLPMixer(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end -@testset "ResMLP" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "ResMLP" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = ResMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end -@testset "gMLP" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "gMLP" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = gMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end diff --git a/test/runtests.jl b/test/runtests.jl index 6dd4a1aa4..61af837a7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,33 +3,27 @@ using Flux using Flux: Zygote function gradtest(model, input) - y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) - gs = pb(ones(Float32, size(y))) + y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) + gs = pb(ones(Float32, size(y))) - # if we make it to here with no error, success! - return true + # if we make it to here with no error, success! 
+ return true end x_224 = rand(Float32, 224, 224, 3, 1) x_256 = rand(Float32, 256, 256, 3, 1) # CNN tests -@testset verbose = true "ConvNets" begin - include("convnets.jl") -end +@testset verbose=true "ConvNets" begin include("convnets.jl") end GC.safepoint() GC.gc() # Other tests -@testset verbose = true "Other" begin - include("other.jl") -end +@testset verbose=true "Other" begin include("other.jl") end GC.safepoint() GC.gc() # ViT tests -@testset verbose = true "ViTs" begin - include("vit-based.jl") -end +@testset verbose=true "ViTs" begin include("vit-based.jl") end diff --git a/test/vit-based.jl b/test/vit-based.jl index 20b6ecb86..ebd1a0fc2 100644 --- a/test/vit-based.jl +++ b/test/vit-based.jl @@ -1,12 +1,10 @@ using Metalhead, Test using Flux -@testset "ViT" begin - for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] +@testset "ViT" begin for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] m = ViT(mode) @test size(m(x_256)) == (1000, 1) @test gradtest(m, x_256) GC.safepoint() GC.gc() - end -end +end end From c056917daf7fe9056c3ca4845e27b421ccbc8a4d Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 30 May 2022 06:20:20 +0530 Subject: [PATCH 2/8] Create .git-blame-ignore-revs --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..d62e45914 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,3 @@ +# .git-blame-ignore-revs +# Switched to SciML style for code +fd2869f57c66fa650547cd8581feeba9eda08b88 From c0b2f264126fa592832a8ad717608ea1953987a7 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Fri, 10 Jun 2022 19:16:58 +0530 Subject: [PATCH 3/8] Use `@non_differentiable` function for `fill!` in `ClassTokens` Should solve at least part of #165 --- Project.toml | 1 + src/layers/Layers.jl | 1 + src/layers/embeddings.jl | 7 +++++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index c010c513d..8adb95c2b 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.7.1" [deps] Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" diff --git a/src/layers/Layers.jl b/src/layers/Layers.jl index 1034136f3..e9aefd321 100644 --- a/src/layers/Layers.jl +++ b/src/layers/Layers.jl @@ -5,6 +5,7 @@ using Flux: outputsize, Zygote using Functors using Statistics using MLUtils +using ChainRulesCore include("../utilities.jl") diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index fb6bc6e4d..5c8469aa2 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -59,9 +59,12 @@ end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) +_fill_like(y::AbstractArray{T, 3}) where {T} = fill!(similar(y, 1, 1, size(y, 3)), one(T)) +ChainRulesCore.@non_differentiable _fill_like(y) + function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T} - tokens = m.token .* fill(one(T), (1, 1, size(x, 3))) - return hcat(tokens, x) + tokens = m.token .* _fill_like(x) + return hcat(tokens, x) end @functor ClassTokens From 0a864d50aab32760910196de8712e23e6301731d Mon Sep 17 
00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Fri, 10 Jun 2022 21:04:12 +0530 Subject: [PATCH 4/8] Use `MLUtils.ones_like` Also go back to indexing instead of `selectdim` to prevent scalar indexing on the GPU --- Project.toml | 1 - src/layers/Layers.jl | 1 - src/layers/embeddings.jl | 5 +---- src/vit-based/vit.jl | 2 +- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 8adb95c2b..c010c513d 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,6 @@ version = "0.7.1" [deps] Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3" diff --git a/src/layers/Layers.jl b/src/layers/Layers.jl index e9aefd321..1034136f3 100644 --- a/src/layers/Layers.jl +++ b/src/layers/Layers.jl @@ -5,7 +5,6 @@ using Flux: outputsize, Zygote using Functors using Statistics using MLUtils -using ChainRulesCore include("../utilities.jl") diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 5c8469aa2..06116bdc2 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -59,11 +59,8 @@ end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) -_fill_like(y::AbstractArray{T, 3}) where {T} = fill!(similar(y, 1, 1, size(y, 3)), one(T)) -ChainRulesCore.@non_differentiable _fill_like(y) - function (m::ClassTokens)(x::AbstractArray{T, 3}) where {T} - tokens = m.token .* _fill_like(x) + tokens = m.token .* MLUtils.ones_like(x, T, (1, 1, size(x, 3))) return hcat(tokens, x) end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 53932dee1..55b3e3d30 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -53,7 +53,7 @@ function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = ViPosEmbedding(embedplanes, npatches + 1), Dropout(emb_dropout), transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), - (pool == :class) ? x -> selectdim(x, 2, 1) : seconddimmean), + (pool == :class) ? 
x -> x[:, 1, :] : seconddimmean), Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end From f4b88ec5a2eef1ca655c97e5fbeeb0c5e6c263d8 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 30 May 2022 06:19:37 +0530 Subject: [PATCH 5/8] Switch to SciML style for code --- .JuliaFormatter.toml | 2 + src/Metalhead.jl | 25 ++-- src/convnets/alexnet.jl | 43 +++--- src/convnets/convmixer.jl | 39 +++--- src/convnets/convnext.jl | 101 +++++++------- src/convnets/densenet.jl | 82 ++++++------ src/convnets/googlenet.jl | 60 ++++----- src/convnets/inception.jl | 191 ++++++++++++--------------- src/convnets/mobilenet.jl | 263 ++++++++++++++++++------------------- src/convnets/resnet.jl | 121 ++++++++--------- src/convnets/resnext.jl | 94 ++++++------- src/convnets/squeezenet.jl | 57 ++++---- src/convnets/vgg.jl | 112 ++++++++-------- src/layers/attention.jl | 52 ++++---- src/layers/conv.jl | 134 ++++++++++--------- src/layers/embeddings.jl | 27 ++-- src/layers/mlp.jl | 22 ++-- src/layers/normalise.jl | 12 +- src/layers/others.jl | 5 +- src/other/mlpmixer.jl | 155 +++++++++++----------- src/pretrain.jl | 20 +-- src/utilities.jl | 18 +-- src/vit-based/vit.jl | 32 +++-- test/convnets.jl | 200 ++++++++++++++-------------- test/other.jl | 48 +++---- test/runtests.jl | 20 +-- test/vit-based.jl | 6 +- 27 files changed, 960 insertions(+), 981 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 000000000..93a9e7665 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +style = "sciml" +whitespace_in_kwargs = true diff --git a/src/Metalhead.jl b/src/Metalhead.jl index a0fb3785a..e465b6981 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -37,22 +37,23 @@ include("vit-based/vit.jl") include("pretrain.jl") -export AlexNet, - VGG, VGG11, VGG13, VGG16, VGG19, - ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, - GoogLeNet, Inception3, SqueezeNet, - DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, - ResNeXt, - MobileNetv1, MobileNetv2, MobileNetv3, - MLPMixer, ResMLP, gMLP, - ViT, - ConvNeXt, ConvMixer +export AlexNet, + VGG, VGG11, VGG13, VGG16, VGG19, + ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, + GoogLeNet, Inception3, SqueezeNet, + DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, + ResNeXt, + MobileNetv1, MobileNetv2, MobileNetv3, + MLPMixer, ResMLP, gMLP, + ViT, + ConvNeXt, ConvMixer # use Flux._big_show to pretty print large models -for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, :ResNeXt, +for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, + :ResNeXt, :MobileNetv1, :MobileNetv2, :MobileNetv3, :MLPMixer, :ResMLP, :gMLP, :ViT, :ConvNeXt, :ConvMixer) - @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) + @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) end end # module diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index ea3962c2a..93bf1cd67 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -8,23 +8,23 @@ Create an AlexNet model - `nclasses`: the number of output classes """ function alexnet(; nclasses = 1000) - layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((5, 5), 64 => 192, relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((3, 3), 
192 => 384, relu, pad = (1, 1)), - Conv((3, 3), 384 => 256, relu, pad = (1, 1)), - Conv((3, 3), 256 => 256, relu, pad = (1, 1)), - MaxPool((3, 3), stride = (2, 2)), - AdaptiveMeanPool((6,6))), - Chain(MLUtils.flatten, - Dropout(0.5), - Dense(256 * 6 * 6, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dense(4096, nclasses))) - - return layers + layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Conv((5, 5), 64 => 192, relu, pad = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Conv((3, 3), 192 => 384, relu, pad = (1, 1)), + Conv((3, 3), 384 => 256, relu, pad = (1, 1)), + Conv((3, 3), 256 => 256, relu, pad = (1, 1)), + MaxPool((3, 3), stride = (2, 2)), + AdaptiveMeanPool((6, 6))), + Chain(MLUtils.flatten, + Dropout(0.5), + Dense(256 * 6 * 6, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dense(4096, nclasses))) + + return layers end """ @@ -41,14 +41,13 @@ See also [`alexnet`](#). - `nclasses`: the number of output classes """ struct AlexNet - layers + layers::Any end function AlexNet(; pretrain = false, nclasses = 1000) - layers = alexnet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "AlexNet") - - AlexNet(layers) + layers = alexnet(nclasses = nclasses) + pretrain && loadpretrain!(layers, "AlexNet") + AlexNet(layers) end @functor AlexNet diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index 01a6e61be..2a6aeae05 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -16,20 +16,24 @@ Creates a ConvMixer model. """ function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000) - stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1]) - blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; - preact = true, groups = planes, pad = SamePad())), +), - conv_bn((1, 1), planes, planes, activation; preact = true)...) for _ in 1:depth] - head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) - return Chain(Chain(stem..., Chain(blocks)), head) + stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, + stride = patch_size[1]) + blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; + preact = true, groups = planes, + pad = SamePad())), +), + conv_bn((1, 1), planes, planes, activation; preact = true)...) + for _ in 1:depth] + head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) + return Chain(Chain(stem..., Chain(blocks)), head) end convmixer_config = Dict(:base => Dict(:planes => 1536, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7)), + :patch_size => (7, 7)), :small => Dict(:planes => 768, :depth => 32, :kernel_size => (7, 7), - :patch_size => (7, 7)), - :large => Dict(:planes => 1024, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7))) + :patch_size => (7, 7)), + :large => Dict(:planes => 1024, :depth => 20, + :kernel_size => (9, 9), + :patch_size => (7, 7))) """ ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) @@ -44,16 +48,17 @@ Creates a ConvMixer model. 
- `nclasses`: number of classes in the output """ struct ConvMixer - layers + layers::Any end function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) - planes = convmixer_config[mode][:planes] - depth = convmixer_config[mode][:depth] - kernel_size = convmixer_config[mode][:kernel_size] - patch_size = convmixer_config[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, nclasses) - return ConvMixer(layers) + planes = convmixer_config[mode][:planes] + depth = convmixer_config[mode][:depth] + kernel_size = convmixer_config[mode][:kernel_size] + patch_size = convmixer_config[mode][:patch_size] + layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, + nclasses) + return ConvMixer(layers) end @functor ConvMixer diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 1621803bf..0a44e7482 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -9,15 +9,15 @@ Creates a single block of ConvNeXt. - `drop_path_rate`: Stochastic depth rate. - `λ`: Init value for LayerScale """ -function convnextblock(planes, drop_path_rate = 0., λ = 1f-6) - layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), - swapdims((3, 1, 2, 4)), - LayerNorm(planes; ϵ = 1f-6), - mlp_block(planes, 4 * planes), - LayerScale(planes, λ), - swapdims((2, 3, 1, 4)), - DropPath(drop_path_rate)), +) - return layers +function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6) + layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), + swapdims((3, 1, 2, 4)), + LayerNorm(planes; ϵ = 1.0f-6), + mlp_block(planes, 4 * planes), + LayerScale(planes, λ), + swapdims((2, 3, 1, 4)), + DropPath(drop_path_rate)), +) + return layers end """ @@ -34,45 +34,48 @@ Creates the layers for a ConvNeXt model. 
- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) - `nclasses`: number of output classes """ -function convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) - @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" - - downsample_layers = [] - stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), - ChannelLayerNorm(planes[1]; ϵ = 1f-6)) - push!(downsample_layers, stem) - for m in 1:length(depths) - 1 - downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1f-6), - Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) - push!(downsample_layers, downsample_layer) - end - - stages = [] - dp_rates = LinRange{Float32}(0., drop_path_rate, sum(depths)) - cur = 0 - for i in 1:length(depths) - push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) - cur += depths[i] - end - - backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) - head = Chain(GlobalMeanPool(), - MLUtils.flatten, - LayerNorm(planes[end]), - Dense(planes[end], nclasses)) - - return Chain(Chain(backbone), head) +function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, + nclasses = 1000) + @assert length(depths)==length(planes) "`planes` should have exactly one value for each block" + downsample_layers = [] + stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), + ChannelLayerNorm(planes[1]; ϵ = 1.0f-6)) + push!(downsample_layers, stem) + for m in 1:(length(depths) - 1) + downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1.0f-6), + Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) + push!(downsample_layers, downsample_layer) + end + stages = [] + dp_rates = LinRange{Float32}(0.0, drop_path_rate, sum(depths)) + cur = 0 + for i in 1:length(depths) + push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) + cur += depths[i] + end + backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) + head = Chain(GlobalMeanPool(), + MLUtils.flatten, + LayerNorm(planes[end]), + Dense(planes[end], nclasses)) + + return Chain(Chain(backbone), head) end # Configurations for ConvNeXt models -convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], :planes => [96, 192, 384, 768]), - :small => Dict(:depths => [3, 3, 27, 3], :planes => [96, 192, 384, 768]), - :base => Dict(:depths => [3, 3, 27, 3], :planes => [128, 256, 512, 1024]), - :large => Dict(:depths => [3, 3, 27, 3], :planes => [192, 384, 768, 1536]), - :xlarge => Dict(:depths => [3, 3, 27, 3], :planes => [256, 512, 1024, 2048])) +convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], + :planes => [96, 192, 384, 768]), + :small => Dict(:depths => [3, 3, 27, 3], + :planes => [96, 192, 384, 768]), + :base => Dict(:depths => [3, 3, 27, 3], + :planes => [128, 256, 512, 1024]), + :large => Dict(:depths => [3, 3, 27, 3], + :planes => [192, 384, 768, 1536]), + :xlarge => Dict(:depths => [3, 3, 27, 3], + :planes => [256, 512, 1024, 2048])) struct ConvNeXt - layers + layers::Any end """ @@ -89,13 +92,13 @@ Creates a ConvNeXt model. See also [`Metalhead.convnext`](#). 
""" -function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, +function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, nclasses = 1000) - @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" - depths = convnext_configs[mode][:depths] - planes = convnext_configs[mode][:planes] - layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) - return ConvNeXt(layers) + @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" + depths = convnext_configs[mode][:depths] + planes = convnext_configs[mode][:planes] + layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) + return ConvNeXt(layers) end (m::ConvNeXt)(x) = m.layers(x) diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index bda7a321d..be98509e6 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -10,11 +10,12 @@ Create a Densenet bottleneck layer (and scaling factor for inner feature maps; see ref) """ function dense_bottleneck(inplanes, outplanes) - inner_channels = 4 * outplanes - m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., - conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, rev = true)...) + inner_channels = 4 * outplanes + m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., + conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, + rev = true)...) - SkipConnection(m, cat_channels) + SkipConnection(m, cat_channels) end """ @@ -27,8 +28,10 @@ Create a DenseNet transition sequence - `inplanes`: number of input feature maps - `outplanes`: number of output feature maps """ -transition(inplanes, outplanes) = - Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., MeanPool((2, 2))) +function transition(inplanes, outplanes) + Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., + MeanPool((2, 2))) +end """ dense_block(inplanes, growth_rates) @@ -42,8 +45,10 @@ the number of output feature maps by `growth_rates` with each block - `growth_rates`: the growth (additive) rates of output feature maps after each block (a vector of `k`s from the ref) """ -dense_block(inplanes, growth_rates) = [dense_bottleneck(i, o) - for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +function dense_block(inplanes, growth_rates) + [dense_bottleneck(i, o) + for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +end """ densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) @@ -59,24 +64,24 @@ Create a DenseNet model - `nclasses`: the number of output classes """ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) - push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) - - outplanes = 0 - for (i, rates) in enumerate(growth_rates) - outplanes = inplanes + sum(rates) - append!(layers, dense_block(inplanes, rates)) - (i != length(growth_rates)) && - push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) - inplanes = floor(Int, outplanes * reduction) - end - push!(layers, BatchNorm(outplanes, relu)) - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dense(outplanes, nclasses))) + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), 
bias = false)) + push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) + + outplanes = 0 + for (i, rates) in enumerate(growth_rates) + outplanes = inplanes + sum(rates) + append!(layers, dense_block(inplanes, rates)) + (i != length(growth_rates)) && + push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) + inplanes = floor(Int, outplanes * reduction) + end + push!(layers, BatchNorm(outplanes, relu)) + + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dense(outplanes, nclasses))) end """ @@ -91,9 +96,10 @@ Create a DenseNet model - `reduction`: the factor by which the number of feature maps is scaled across each transition - `nclasses`: the number of output classes """ -densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) = - densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; - reduction = reduction, nclasses = nclasses) +function densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) + densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; + reduction = reduction, nclasses = nclasses) +end """ DenseNet(nblocks::NTuple{N, <:Integer}; @@ -110,16 +116,16 @@ See also [`densenet`](#). - `nclasses`: the number of output classes """ struct DenseNet - layers + layers::Any end function DenseNet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N} - layers = densenet(nblocks; growth_rate = growth_rate, - reduction = reduction, - nclasses = nclasses) + layers = densenet(nblocks; growth_rate = growth_rate, + reduction = reduction, + nclasses = nclasses) - DenseNet(layers) + DenseNet(layers) end @functor DenseNet @@ -148,11 +154,11 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.densenet`](#). """ function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) - @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." - model = DenseNet(densenet_config[config]; nclasses = nclasses) + @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." 
+ model = DenseNet(densenet_config[config]; nclasses = nclasses) - pretrain && loadpretrain!(model, string("DenseNet", config)) - return model + pretrain && loadpretrain!(model, string("DenseNet", config)) + return model end # deprecations diff --git a/src/convnets/googlenet.jl b/src/convnets/googlenet.jl index bc42a052f..40dd5ff41 100644 --- a/src/convnets/googlenet.jl +++ b/src/convnets/googlenet.jl @@ -15,16 +15,12 @@ Create an inception module for use in GoogLeNet """ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj) branch1 = Chain(Conv((1, 1), inplanes => out_1x1)) - branch2 = Chain(Conv((1, 1), inplanes => red_3x3), Conv((3, 3), red_3x3 => out_3x3; pad = 1)) - branch3 = Chain(Conv((1, 1), inplanes => red_5x5), - Conv((5, 5), red_5x5 => out_5x5; pad = 2)) - - branch4 = Chain(MaxPool((3, 3), stride=1, pad = 1), + Conv((5, 5), red_5x5 => out_5x5; pad = 2)) + branch4 = Chain(MaxPool((3, 3), stride = 1, pad = 1), Conv((1, 1), inplanes => pool_proj)) - return Parallel(cat_channels, branch1, branch2, branch3, branch4) end @@ -39,28 +35,27 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) - `nclasses`: the number of output classes """ function googlenet(; nclasses = 1000) - layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), - MaxPool((3, 3), stride = 2, pad = 1), - Conv((1, 1), 64 => 64), - Conv((3, 3), 64 => 192; pad = 1), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(192, 64, 96, 128, 16, 32, 32), - _inceptionblock(256, 128, 128, 192, 32, 96, 64), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(480, 192, 96, 208, 16, 48, 64), - _inceptionblock(512, 160, 112, 224, 24, 64, 64), - _inceptionblock(512, 128, 128, 256, 24, 64, 64), - _inceptionblock(512, 112, 144, 288, 32, 64, 64), - _inceptionblock(528, 256, 160, 320, 32, 128, 128), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(832, 256, 160, 320, 32, 128, 128), - _inceptionblock(832, 384, 192, 384, 48, 128, 128)), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dropout(0.4), - Dense(1024, nclasses))) - - return layers + layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), + MaxPool((3, 3), stride = 2, pad = 1), + Conv((1, 1), 64 => 64), + Conv((3, 3), 64 => 192; pad = 1), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(192, 64, 96, 128, 16, 32, 32), + _inceptionblock(256, 128, 128, 192, 32, 96, 64), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(480, 192, 96, 208, 16, 48, 64), + _inceptionblock(512, 160, 112, 224, 24, 64, 64), + _inceptionblock(512, 128, 128, 256, 24, 64, 64), + _inceptionblock(512, 112, 144, 288, 32, 64, 64), + _inceptionblock(528, 256, 160, 320, 32, 128, 128), + MaxPool((3, 3), stride = 2, pad = 1), + _inceptionblock(832, 256, 160, 320, 32, 128, 128), + _inceptionblock(832, 384, 192, 384, 48, 128, 128)), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dropout(0.4), + Dense(1024, nclasses))) + return layers end """ @@ -79,14 +74,13 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`) See also [`googlenet`](#). 
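A minimal usage sketch (the input size and batch size are illustrative assumptions):

    using Metalhead

    model = GoogLeNet(; nclasses = 1000)
    x = rand(Float32, 224, 224, 3, 1)   # WHCN image batch
    y = model(x)                        # 1000×1 vector of class logits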
""" struct GoogLeNet - layers + layers::Any end function GoogLeNet(; pretrain = false, nclasses = 1000) - layers = googlenet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "GoogLeNet") - - GoogLeNet(layers) + layers = googlenet(nclasses = nclasses) + pretrain && loadpretrain!(layers, "GoogLeNet") + GoogLeNet(layers) end @functor GoogLeNet diff --git a/src/convnets/inception.jl b/src/convnets/inception.jl index ef8ab81ef..2673d1b8e 100644 --- a/src/convnets/inception.jl +++ b/src/convnets/inception.jl @@ -9,20 +9,16 @@ Create an Inception-v3 style-A module - `pool_proj`: the number of output feature maps for the pooling projection """ function inception_a(inplanes, pool_proj) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) - - branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., - conv_bn((5, 5), 48, 64; pad = 2)...) - - branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; pad = 1)...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, pool_proj)...) - - return Parallel(cat_channels, - branch1x1, branch5x5, branch3x3, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) + branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., + conv_bn((5, 5), 48, 64; pad = 2)...) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; pad = 1)...) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, pool_proj)...) + return Parallel(cat_channels, + branch1x1, branch5x5, branch3x3, branch_pool) end """ @@ -35,16 +31,13 @@ Create an Inception-v3 style-B module - `inplanes`: number of input feature maps """ function inception_b(inplanes) - branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride = 2) - - return Parallel(cat_channels, - branch3x3_1, branch3x3_2, branch_pool) + branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; stride = 2)...) + branch_pool = MaxPool((3, 3), stride = 2) + return Parallel(cat_channels, + branch3x3_1, branch3x3_2, branch_pool) end """ @@ -59,23 +52,19 @@ Create an Inception-v3 style-C module - `n`: the "grid size" (kernel size) for the convolution layers """ function inception_c(inplanes, inner_planes, n = 7) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) - - branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) - - branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride=1), - conv_bn((1, 1), inplanes, 192)...) 
- - return Parallel(cat_channels, - branch1x1, branch7x7_1, branch7x7_2, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) + branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) + branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) + return Parallel(cat_channels, + branch1x1, branch7x7_1, branch7x7_2, branch_pool) end """ @@ -88,18 +77,15 @@ Create an Inception-v3 style-D module - `inplanes`: number of input feature maps """ function inception_d(inplanes) - branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((3, 3), 192, 320; stride = 2)...) - - branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((1, 7), 192, 192; pad = (0, 3))..., - conv_bn((7, 1), 192, 192; pad = (3, 0))..., - conv_bn((3, 3), 192, 192; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride=2) - - return Parallel(cat_channels, - branch3x3, branch7x7x3, branch_pool) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((3, 3), 192, 320; stride = 2)...) + branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((1, 7), 192, 192; pad = (0, 3))..., + conv_bn((7, 1), 192, 192; pad = (3, 0))..., + conv_bn((3, 3), 192, 192; stride = 2)...) + branch_pool = MaxPool((3, 3), stride = 2) + return Parallel(cat_channels, + branch3x3, branch7x7x3, branch_pool) end """ @@ -112,30 +98,25 @@ Create an Inception-v3 style-E module - `inplanes`: number of input feature maps """ function inception_e(inplanes) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) - - branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) - branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., - conv_bn((3, 3), 448, 384; pad = 1)...) - branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, 192)...) - - return Parallel(cat_channels, - branch1x1, - Chain(branch3x3_1, - Parallel(cat_channels, - branch3x3_1a, branch3x3_1b)), - - Chain(branch3x3_2, - Parallel(cat_channels, - branch3x3_2a, branch3x3_2b)), - branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) + branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) + branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., + conv_bn((3, 3), 448, 384; pad = 1)...) + branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) 
+ return Parallel(cat_channels, + branch1x1, + Chain(branch3x3_1, + Parallel(cat_channels, + branch3x3_1a, branch3x3_1b)), + Chain(branch3x3_2, + Parallel(cat_channels, + branch3x3_2a, branch3x3_2b)), + branch_pool) end """ @@ -150,30 +131,29 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). `inception3` does not currently support pretrained weights. """ function inception3(; nclasses = 1000) - layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., - conv_bn((3, 3), 32, 32)..., - conv_bn((3, 3), 32, 64; pad = 1)..., - MaxPool((3, 3), stride = 2), - conv_bn((1, 1), 64, 80)..., - conv_bn((3, 3), 80, 192)..., - MaxPool((3, 3), stride = 2), - inception_a(192, 32), - inception_a(256, 64), - inception_a(288, 64), - inception_b(288), - inception_c(768, 128), - inception_c(768, 160), - inception_c(768, 160), - inception_c(768, 192), - inception_d(768), - inception_e(1280), - inception_e(2048)), - Chain(AdaptiveMeanPool((1, 1)), - Dropout(0.2), - MLUtils.flatten, - Dense(2048, nclasses))) - - return layer + layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., + conv_bn((3, 3), 32, 32)..., + conv_bn((3, 3), 32, 64; pad = 1)..., + MaxPool((3, 3), stride = 2), + conv_bn((1, 1), 64, 80)..., + conv_bn((3, 3), 80, 192)..., + MaxPool((3, 3), stride = 2), + inception_a(192, 32), + inception_a(256, 64), + inception_a(288, 64), + inception_b(288), + inception_c(768, 128), + inception_c(768, 160), + inception_c(768, 160), + inception_c(768, 192), + inception_d(768), + inception_e(1280), + inception_e(2048)), + Chain(AdaptiveMeanPool((1, 1)), + Dropout(0.2), + MLUtils.flatten, + Dense(2048, nclasses))) + return layer end """ @@ -190,14 +170,13 @@ See also [`inception3`](#). `Inception3` does not currently support pretrained weights. """ struct Inception3 - layers + layers::Any end function Inception3(; pretrain = false, nclasses = 1000) - layers = inception3(nclasses = nclasses) - pretrain && loadpretrain!(layers, "Inception3") - - Inception3(layers) + layers = inception3(nclasses = nclasses) + pretrain && loadpretrain!(layers, "Inception3") + Inception3(layers) end @functor Inception3 diff --git a/src/convnets/mobilenet.jl b/src/convnets/mobilenet.jl index 2dfd06f8d..fed893142 100644 --- a/src/convnets/mobilenet.jl +++ b/src/convnets/mobilenet.jl @@ -27,37 +27,37 @@ function mobilenetv1(width_mult, config; inchannels = 3, nclasses = 1000, fcsize = 1024) - layers = [] - for (dw, outch, stride, nrepeats) in config - outch = Int(outch * width_mult) - for _ in 1:nrepeats - layer = dw ? depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; - stride = stride, pad = 1) : - conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) - append!(layers, layer) - inchannels = outch + layers = [] + for (dw, outch, stride, nrepeats) in config + outch = Int(outch * width_mult) + for _ in 1:nrepeats + layer = dw ? 
+ depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; + stride = stride, pad = 1) : + conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) + append!(layers, layer) + inchannels = outch + end end - end - - return Chain(Chain(layers), - Chain(GlobalMeanPool(), - MLUtils.flatten, - Dense(inchannels, fcsize, activation), - Dense(fcsize, nclasses))) + return Chain(Chain(layers), + Chain(GlobalMeanPool(), + MLUtils.flatten, + Dense(inchannels, fcsize, activation), + Dense(fcsize, nclasses))) end const mobilenetv1_configs = [ -# dw, c, s, r - (false, 32, 2, 1), - ( true, 64, 1, 1), - ( true, 128, 2, 1), - ( true, 128, 1, 1), - ( true, 256, 2, 1), - ( true, 256, 1, 1), - ( true, 512, 2, 1), - ( true, 512, 1, 5), - ( true, 1024, 2, 1), - ( true, 1024, 1, 1) + # dw, c, s, r + (false, 32, 2, 1), + (true, 64, 1, 1), + (true, 128, 2, 1), + (true, 128, 1, 1), + (true, 256, 2, 1), + (true, 256, 1, 1), + (true, 512, 2, 1), + (true, 512, 1, 5), + (true, 1024, 2, 1), + (true, 1024, 1, 1), ] """ @@ -77,14 +77,13 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. See also [`Metalhead.mobilenetv1`](#). """ struct MobileNetv1 - layers + layers::Any end function MobileNetv1(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv1")) - - return MobileNetv1(layers) + layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv1")) + return MobileNetv1(layers) end @functor MobileNetv1 @@ -95,7 +94,6 @@ backbone(m::MobileNetv1) = m.layers[1] classifier(m::MobileNetv1) = m.layers[2] # MobileNetv2 - """ mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) @@ -115,44 +113,45 @@ Create a MobileNetv2 model. - `nclasses`: The number of output classes """ function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) - # building first layer - inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) - - # building inverted residual blocks - for (t, c, n, s, a) in configs - outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) - for i in 1:n - push!(layers, invertedresidual(3, inplanes, inplanes * t, outplanes, a; - stride = i == 1 ? s : 1)) - inplanes = outplanes + # building first layer + inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) + # building inverted residual blocks + for (t, c, n, s, a) in configs + outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) + for i in 1:n + push!(layers, + invertedresidual(3, inplanes, inplanes * t, outplanes, a; + stride = i == 1 ? s : 1)) + inplanes = outplanes + end end - end - - # building last several layers - outplanes = (width_mult > 1) ? _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : - max_width - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(outplanes, nclasses))) + # building last several layers + outplanes = (width_mult > 1) ? + _round_channels(max_width * width_mult, width_mult == 0.1 ? 
4 : 8) : + max_width + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(outplanes, nclasses))) end # Layer configurations for MobileNetv2 const mobilenetv2_configs = [ -# t, c, n, s, a - (1, 16, 1, 1, relu6), - (6, 24, 2, 2, relu6), - (6, 32, 3, 2, relu6), - (6, 64, 4, 2, relu6), - (6, 96, 3, 1, relu6), - (6, 160, 3, 2, relu6), - (6, 320, 1, 1, relu6) + # t, c, n, s, a + (1, 16, 1, 1, relu6), + (6, 24, 2, 2, relu6), + (6, 32, 3, 2, relu6), + (6, 64, 4, 2, relu6), + (6, 96, 3, 1, relu6), + (6, 160, 3, 2, relu6), + (6, 320, 1, 1, relu6), ] # Model definition for MobileNetv2 struct MobileNetv2 - layers + layers::Any end """ @@ -172,10 +171,9 @@ Set `pretrain` to `true` to load the pretrained weights for ImageNet. See also [`Metalhead.mobilenetv2`](#). """ function MobileNetv2(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv2")) - - MobileNetv2(layers) + layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv2")) + MobileNetv2(layers) end @functor MobileNetv2 @@ -186,7 +184,6 @@ backbone(m::MobileNetv2) = m.layers[1] classifier(m::MobileNetv2) = m.layers[2] # MobileNetv3 - """ mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) @@ -208,71 +205,70 @@ Create a MobileNetv3 model. - `nclasses`: the number of output classes """ function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) - # building first layer - inplanes = _round_channels(16 * width_mult, 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) - explanes = 0 - # building inverted residual blocks - for (k, t, c, r, a, s) in configs - # inverted residual layers - outplanes = _round_channels(c * width_mult, 8) - explanes = _round_channels(inplanes * t, 8) - push!(layers, invertedresidual(k, inplanes, explanes, outplanes, a; - stride = s, reduction = r)) - inplanes = outplanes - end - - # building last several layers - output_channel = max_width - output_channel = width_mult > 1.0 ? _round_channels(output_channel * width_mult, 8) : output_channel - classifier = Chain(Dense(explanes, output_channel, hardswish), - Dropout(0.2), - Dense(output_channel, nclasses)) - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) + # building first layer + inplanes = _round_channels(16 * width_mult, 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) + explanes = 0 + # building inverted residual blocks + for (k, t, c, r, a, s) in configs + # inverted residual layers + outplanes = _round_channels(c * width_mult, 8) + explanes = _round_channels(inplanes * t, 8) + push!(layers, + invertedresidual(k, inplanes, explanes, outplanes, a; + stride = s, reduction = r)) + inplanes = outplanes + end + # building last several layers + output_channel = max_width + output_channel = width_mult > 1.0 ? 
_round_channels(output_channel * width_mult, 8) : + output_channel + classifier = Chain(Dense(explanes, output_channel, hardswish), + Dropout(0.2), + Dense(output_channel, nclasses)) + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) end # Configurations for small and large mode for MobileNetv3 -mobilenetv3_configs = Dict( - :small => [ - # k, t, c, SE, a, s - (3, 1, 16, 4, relu, 2), - (3, 4.5, 24, nothing, relu, 2), - (3, 3.67, 24, nothing, relu, 1), - (5, 4, 40, 4, hardswish, 2), - (5, 6, 40, 4, hardswish, 1), - (5, 6, 40, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 2), - (5, 6, 96, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 1), - ], - :large => [ - # k, t, c, SE, a, s - (3, 1, 16, nothing, relu, 1), - (3, 4, 24, nothing, relu, 2), - (3, 3, 24, nothing, relu, 1), - (5, 3, 40, 4, relu, 2), - (5, 3, 40, 4, relu, 1), - (5, 3, 40, 4, relu, 1), - (3, 6, 80, nothing, hardswish, 2), - (3, 2.5, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 2), - (5, 6, 160, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 1) - ] -) +mobilenetv3_configs = Dict(:small => [ + # k, t, c, SE, a, s + (3, 1, 16, 4, relu, 2), + (3, 4.5, 24, nothing, relu, 2), + (3, 3.67, 24, nothing, relu, 1), + (5, 4, 40, 4, hardswish, 2), + (5, 6, 40, 4, hardswish, 1), + (5, 6, 40, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 2), + (5, 6, 96, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 1), + ], + :large => [ + # k, t, c, SE, a, s + (3, 1, 16, nothing, relu, 1), + (3, 4, 24, nothing, relu, 2), + (3, 3, 24, nothing, relu, 1), + (5, 3, 40, 4, relu, 2), + (5, 3, 40, 4, relu, 1), + (5, 3, 40, 4, relu, 1), + (3, 6, 80, nothing, hardswish, 2), + (3, 2.5, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 2), + (5, 6, 160, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 1), + ]) # Model definition for MobileNetv3 struct MobileNetv3 - layers + layers::Any end """ @@ -292,13 +288,14 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.mobilenetv3`](#). """ -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, nclasses = 1000) - @assert mode in [:large, :small] "`mode` has to be either :large or :small" - - max_width = (mode == :large) ? 1280 : 1024 - layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) - MobileNetv3(layers) +function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, + nclasses = 1000) + @assert mode in [:large, :small] "`mode` has to be either :large or :small" + max_width = (mode == :large) ? 
1280 : 1024 + layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, + nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) + MobileNetv3(layers) end @functor MobileNetv3 diff --git a/src/convnets/resnet.jl b/src/convnets/resnet.jl index d91d65d6a..54bb5cb35 100644 --- a/src/convnets/resnet.jl +++ b/src/convnets/resnet.jl @@ -11,9 +11,11 @@ Create a basic residual block - `downsample`: set to `true` to downsample the input """ function basicblock(inplanes, outplanes, downsample = false) - stride = downsample ? 2 : 1 - Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, bias = false)...) + stride = downsample ? 2 : 1 + Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, + bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, + bias = false)...) end """ @@ -36,9 +38,11 @@ The original paper uses `stride == [2, 1, 1]` when `downsample == true` instead. """ function bottleneck(inplanes, outplanes, downsample = false; stride = [1, (downsample ? 2 : 1), 1]) - Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, bias = false)..., - conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], bias = false)...) + Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, + bias = false)..., + conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], + bias = false)...) end """ @@ -55,8 +59,9 @@ layer which has a stride of 2. within the residual block - `downsample`: set to `true` to downsample the input """ -bottleneck_v1(inplanes, outplanes, downsample = false) = +function bottleneck_v1(inplanes, outplanes, downsample = false) bottleneck(inplanes, outplanes, downsample; stride = [(downsample ? 
2 : 1), 1, 1]) +end """ resnet(block, residuals::NTuple{2, Any}, connection = addrelu; @@ -78,31 +83,33 @@ Create a ResNet model """ function resnet(block, residuals::AbstractVector{<:NTuple{2, Any}}, connection = addrelu; channel_config, block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 64 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes .* channel_config - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, block(inplanes, outplanes, i != 1), - residuals[i][1](inplanes, outplanes[end], i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes[end] - for _ in 2:nrepeats - push!(layers, Parallel(connection, block(inplanes, outplanes, false), - residuals[i][2](inplanes, outplanes[end], false))) - inplanes = outplanes[end] + inplanes = 64 + baseplanes = 64 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes .* channel_config + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, block(inplanes, outplanes, i != 1), + residuals[i][1](inplanes, outplanes[end], i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes[end] + for _ in 2:nrepeats + push!(layers, + Parallel(connection, block(inplanes, outplanes, false), + residuals[i][2](inplanes, outplanes[end], false))) + inplanes = outplanes[end] + end + # next set of output plane base is doubled + baseplanes *= 2 end - # next set of output plane base is doubled - baseplanes *= 2 - end - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -126,17 +133,14 @@ Create a ResNet model - `nclasses`: the number of output classes """ function resnet(block, shortcut_config::AbstractVector{<:Symbol}, args...; kwargs...) - shortcut_dict = Dict( - :A => (skip_identity, skip_identity), - :B => (skip_projection, skip_identity), - :C => (skip_projection, skip_projection)) - - if any(sc -> !haskey(shortcut_dict,sc),shortcut_config) - error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") - end - - shortcut = [shortcut_dict[sc] for sc in shortcut_config] - resnet(block, shortcut, args...; kwargs...) + shortcut_dict = Dict(:A => (skip_identity, skip_identity), + :B => (skip_projection, skip_identity), + :C => (skip_projection, skip_projection)) + if any(sc -> !haskey(shortcut_dict, sc), shortcut_config) + error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") + end + shortcut = [shortcut_dict[sc] for sc in shortcut_config] + resnet(block, shortcut, args...; kwargs...) end function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs...) @@ -144,14 +148,15 @@ function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs... 
block_config = block_config, kwargs...) end -resnet(block, residuals::NTuple{2}, args...; kwargs...) = resnet(block, [residuals], args...; kwargs...) +function resnet(block, residuals::NTuple{2}, args...; kwargs...) + resnet(block, [residuals], args...; kwargs...) +end -const resnet_config = - Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), - 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), - 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), - 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), - 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) +const resnet_config = Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), + 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), + 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), + 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), + 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) """ ResNet(channel_config, block_config, shortcut_config; @@ -173,19 +178,18 @@ See also [`resnet`](#). - `nclasses`: the number of output classes """ struct ResNet - layers + layers::Any end function ResNet(channel_config, block_config, shortcut_config; block, connection = addrelu, nclasses = 1000) - layers = resnet(block, - shortcut_config, - connection; - channel_config = channel_config, - block_config = block_config, - nclasses = nclasses) - - ResNet(layers) + layers = resnet(block, + shortcut_config, + connection; + channel_config = channel_config, + block_config = block_config, + nclasses = nclasses) + ResNet(layers) end @functor ResNet @@ -238,7 +242,6 @@ resnet50_v1 = ResNet([1, 1, 4], [3, 4, 6, 3], :B; block = Metalhead.bottleneck_v """ function ResNet(depth::Integer = 50; pretrain = false, nclasses = 1000) @assert depth in keys(resnet_config) "`depth` must be one of $(sort(collect(keys(resnet_config))))" - config, block = resnet_config[depth] model = ResNet(config...; block = block, nclasses = nclasses) pretrain && loadpretrain!(model, string("ResNet", depth)) diff --git a/src/convnets/resnext.jl b/src/convnets/resnext.jl index eaa66f98f..41910cb26 100644 --- a/src/convnets/resnext.jl +++ b/src/convnets/resnext.jl @@ -12,12 +12,12 @@ Create a basic residual block as defined in the paper for ResNeXt - `downsample`: set to `true` to downsample the input """ function resnextblock(inplanes, outplanes, cardinality, width, downsample = false) - stride = downsample ? 2 : 1 - hidden_channels = cardinality * width - return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., - conv_bn((3, 3), hidden_channels, hidden_channels; - stride = stride, pad = 1, bias = false, groups = cardinality)..., - conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) + stride = downsample ? 2 : 1 + hidden_channels = cardinality * width + return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., + conv_bn((3, 3), hidden_channels, hidden_channels; + stride = stride, pad = 1, bias = false, groups = cardinality)..., + conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) end """ @@ -35,33 +35,39 @@ Create a ResNeXt model - `block_config`: a list of the number of residual blocks at each stage - `nclasses`: the number of output classes """ -function resnext(cardinality, width, widen_factor = 2, connection = (x, y) -> @. relu(x) + relu(y); +function resnext(cardinality, width, widen_factor = 2, + connection = (x, y) -> @. 
relu(x) + relu(y); block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 128 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes * widen_factor - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, i != 1), - skip_projection(inplanes, outplanes, i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes - for _ in 2:nrepeats - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, false), - skip_identity(inplanes, outplanes, false))) + inplanes = 64 + baseplanes = 128 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes * widen_factor + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, i != 1), + skip_projection(inplanes, outplanes, i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes + for _ in 2:nrepeats + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, false), + skip_identity(inplanes, outplanes, false))) + end + baseplanes = outplanes + # double width after every cluster of blocks + width *= widen_factor end - baseplanes = outplanes - # double width after every cluster of blocks - width *= widen_factor - end - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -77,12 +83,12 @@ Create a ResNeXt model - `nclasses`: the number of output classes """ struct ResNeXt - layers + layers::Any end function ResNeXt(cardinality, width; block_config, nclasses = 1000) - layers = resnext(cardinality, width; block_config, nclasses) - ResNeXt(layers) + layers = resnext(cardinality, width; block_config, nclasses) + ResNeXt(layers) end @functor ResNeXt @@ -92,11 +98,9 @@ end backbone(m::ResNeXt) = m.layers[1] classifier(m::ResNeXt) = m.layers[2] -const resnext_config = Dict( - 50 => (3, 4, 6, 3), - 101 => (3, 4, 23, 3), - 152 => (3, 8, 36, 3) -) +const resnext_config = Dict(50 => (3, 4, 6, 3), + 101 => (3, 4, 23, 3), + 152 => (3, 8, 36, 3)) """ ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) @@ -110,10 +114,10 @@ Set `pretrain = true` to load the model with pre-trained weights for ImageNet. See also [`Metalhead.resnext`](#). 
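A minimal usage sketch (the keyword values shown are the defaults; the input size is an illustrative assumption):

    using Metalhead

    model = ResNeXt(50; cardinality = 32, width = 4, nclasses = 1000)   # config is one of 50, 101, 152
    x = rand(Float32, 224, 224, 3, 1)   # WHCN image batch
    y = model(x)                        # 1000×1 vector of class logits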
""" -function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) - @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" - - model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) - pretrain && loadpretrain!(model, string("ResNeXt", config)) - model -end \ No newline at end of file +function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, + nclasses = 1000) + @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" + model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) + pretrain && loadpretrain!(model, string("ResNeXt", config)) + model +end diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index 169ad2e86..209dfb9a2 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -11,14 +11,14 @@ Create a fire module - `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution """ function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) - branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) - branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) - branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) + branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) + branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) + branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) - return Chain(branch_1, - Parallel(cat_channels, - branch_2, - branch_3)) + return Chain(branch_1, + Parallel(cat_channels, + branch_2, + branch_3)) end """ @@ -28,24 +28,24 @@ Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). """ function squeezenet() - layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), - MaxPool((3, 3), stride = 2), - fire(64, 16, 64, 64), - fire(128, 16, 64, 64), - MaxPool((3, 3), stride = 2), - fire(128, 32, 128, 128), - fire(256, 32, 128, 128), - MaxPool((3, 3), stride = 2), - fire(256, 48, 192, 192), - fire(384, 48, 192, 192), - fire(384, 64, 256, 256), - fire(512, 64, 256, 256), - Dropout(0.5), - Conv((1, 1), 512 => 1000, relu)), - AdaptiveMeanPool((1, 1)), - MLUtils.flatten) + layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), + MaxPool((3, 3), stride = 2), + fire(64, 16, 64, 64), + fire(128, 16, 64, 64), + MaxPool((3, 3), stride = 2), + fire(128, 32, 128, 128), + fire(256, 32, 128, 128), + MaxPool((3, 3), stride = 2), + fire(256, 48, 192, 192), + fire(384, 48, 192, 192), + fire(384, 64, 256, 256), + fire(512, 64, 256, 256), + Dropout(0.5), + Conv((1, 1), 512 => 1000, relu)), + AdaptiveMeanPool((1, 1)), + MLUtils.flatten) - return layers + return layers end """ @@ -61,14 +61,13 @@ Set `pretrain=true` to load the model with pre-trained weights for ImageNet. See also [`squeezenet`](#). 
""" struct SqueezeNet - layers + layers::Any end function SqueezeNet(; pretrain = false) - layers = squeezenet() - pretrain && loadpretrain!(layers, "SqueezeNet") - - SqueezeNet(layers) + layers = squeezenet() + pretrain && loadpretrain!(layers, "SqueezeNet") + SqueezeNet(layers) end @functor SqueezeNet diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index bdca0d9ee..2f8777297 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -11,18 +11,18 @@ A VGG block of convolution layers - `batchnorm`: set to `true` to include batch normalization after each convolution """ function vgg_block(ifilters, ofilters, depth, batchnorm) - k = (3,3) - p = (1,1) - layers = [] - for _ in 1:depth - if batchnorm - append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) - else - push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + k = (3, 3) + p = (1, 1) + layers = [] + for _ in 1:depth + if batchnorm + append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) + else + push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + end + ifilters = ofilters end - ifilters = ofilters - end - return layers + return layers end """ @@ -38,14 +38,14 @@ Create VGG convolution layers - `inchannels`: number of input channels """ function vgg_convolutional_layers(config, batchnorm, inchannels) - layers = [] - ifilters = inchannels - for c in config - append!(layers, vgg_block(ifilters, c..., batchnorm)) - push!(layers, MaxPool((2,2), stride=2)) - ifilters, _ = c - end - return layers + layers = [] + ifilters = inchannels + for c in config + append!(layers, vgg_block(ifilters, c..., batchnorm)) + push!(layers, MaxPool((2, 2), stride = 2)) + ifilters, _ = c + end + return layers end """ @@ -62,12 +62,12 @@ Create VGG classifier (fully connected) layers - `dropout`: the dropout level between each fully connected layer """ function vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(MLUtils.flatten, - Dense(Int(prod(imsize)), fcsize, relu), - Dropout(dropout), - Dense(fcsize, fcsize, relu), - Dropout(dropout), - Dense(fcsize, nclasses)) + return Chain(MLUtils.flatten, + Dense(Int(prod(imsize)), fcsize, relu), + Dropout(dropout), + Dense(fcsize, fcsize, relu), + Dropout(dropout), + Dense(fcsize, nclasses)) end """ @@ -88,16 +88,16 @@ Create a VGG model - `dropout`: dropout level between fully connected layers """ function vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - conv = vgg_convolutional_layers(config, batchnorm, inchannels) - imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] - class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(Chain(conv), class) + conv = vgg_convolutional_layers(config, batchnorm, inchannels) + imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] + class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) + return Chain(Chain(conv), class) end -const vgg_conv_config = Dict(:A => [(64,1), (128,1), (256,2), (512,2), (512,2)], - :B => [(64,2), (128,2), (256,2), (512,2), (512,2)], - :D => [(64,2), (128,2), (256,3), (512,3), (512,3)], - :E => [(64,2), (128,2), (256,4), (512,4), (512,4)]) +const vgg_conv_config = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512, 2)], + :B => [(64, 2), (128, 2), (256, 2), (512, 2), (512, 2)], + :D => [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)], + :E => [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]) const vgg_config = Dict(11 => :A, 13 => :B, @@ -105,7 +105,7 @@ const vgg_config = 
Dict(11 => :A, 19 => :E) struct VGG - layers + layers::Any end """ @@ -124,14 +124,14 @@ Construct a VGG model with the specified input image size. Typically, the image """ function VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - layers = vgg(imsize; config = config, - inchannels = inchannels, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = fcsize, - dropout = dropout) - - VGG(layers) + layers = vgg(imsize; config = config, + inchannels = inchannels, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = fcsize, + dropout = dropout) + + VGG(layers) end @functor VGG @@ -155,21 +155,19 @@ See also [`VGG`](#). - `pretrain`: set to `true` to load pre-trained model weights for ImageNet """ function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000) - @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" - - model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], - inchannels = 3, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = 4096, - dropout = 0.5) - - if pretrain && !batchnorm - loadpretrain!(model, string("VGG", depth)) - elseif pretrain - loadpretrain!(model, "VGG$(depth)-BN)") - end - model + @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" + model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], + inchannels = 3, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = 4096, + dropout = 0.5) + if pretrain && !batchnorm + loadpretrain!(model, string("VGG", depth)) + elseif pretrain + loadpretrain!(model, "VGG$(depth)-BN)") + end + model end # deprecations diff --git a/src/layers/attention.jl b/src/layers/attention.jl index 10baf73e9..917b58c88 100644 --- a/src/layers/attention.jl +++ b/src/layers/attention.jl @@ -10,10 +10,10 @@ Multi-head self-attention layer. - `projection`: projection layer to be used after self-attention """ struct MHAttention{P, Q, R} - nheads::Int - qkv_layer::P - attn_drop::Q - projection::R + nheads::Int + qkv_layer::P + attn_drop::Q + projection::R end """ @@ -28,31 +28,31 @@ Multi-head self-attention layer. - `attn_drop`: dropout rate after the self-attention layer - `proj_drop`: dropout rate after the projection layer """ -function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_drop = 0., proj_drop = 0.) 
- @assert planes % nheads == 0 "planes should be divisible by nheads" - qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) - attn_drop = Dropout(attn_drop) - proj = Chain(Dense(planes, planes), Dropout(proj_drop)) - - MHAttention(nheads, qkv_layer, attn_drop, proj) +function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, + attn_drop = 0.0, proj_drop = 0.0) + @assert planes % nheads==0 "planes should be divisible by nheads" + qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) + attn_drop = Dropout(attn_drop) + proj = Chain(Dense(planes, planes), Dropout(proj_drop)) + MHAttention(nheads, qkv_layer, attn_drop, proj) end @functor MHAttention function (m::MHAttention)(x::AbstractArray{T, 3}) where {T} - nfeatures, seq_len, batch_size = size(x) - x_reshaped = reshape(x, nfeatures, seq_len * batch_size) - qkv = m.qkv_layer(x_reshaped) - qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) - query, key, value = chunk(qkv_reshaped, 3; dims = 4) - scale = convert(T, sqrt(size(query, 1) / m.nheads)) - key_reshaped = reshape( - permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, seq_len * batch_size - ) - query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) - value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - pre_projection = reshape(batched_mul(attention, value_reshaped), (nfeatures, seq_len, batch_size)) - y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) - return reshape(y, :, seq_len, batch_size) + nfeatures, seq_len, batch_size = size(x) + x_reshaped = reshape(x, nfeatures, seq_len * batch_size) + qkv = m.qkv_layer(x_reshaped) + qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) + query, key, value = chunk(qkv_reshaped, 3; dims = 4) + scale = convert(T, sqrt(size(query, 1) / m.nheads)) + key_reshaped = reshape(permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, + seq_len * batch_size) + query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) + value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + pre_projection = reshape(batched_mul(attention, value_reshaped), + (nfeatures, seq_len, batch_size)) + y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) + return reshape(y, :, seq_len, batch_size) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ca30df8a4..8455a257e 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -24,28 +24,26 @@ Create a convolution + batch normalization pair with activation. """ function conv_bn(kernelsize, inplanes, outplanes, activation = relu; rev = false, preact = false, - initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1f-5, momentum = 1f-1, + initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1.0f-5, momentum = 1.0f-1, kwargs...) - layers = [] - - if rev - activations = (conv = activation, bn = identity) - bnplanes = inplanes - else - activations = (conv = identity, bn = activation) - bnplanes = outplanes - end - - if preact - rev ? 
throw(ArgumentError("preact and rev cannot be set at the same time")) : - activations = (conv = activation, bn = identity) - end - - push!(layers, Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) - push!(layers, BatchNorm(Int(bnplanes), activations.bn; - initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) - - return rev ? reverse(layers) : layers + layers = [] + if rev + activations = (conv = activation, bn = identity) + bnplanes = inplanes + else + activations = (conv = identity, bn = activation) + bnplanes = outplanes + end + if preact + rev ? throw(ArgumentError("preact and rev cannot be set at the same time")) : + activations = (conv = activation, bn = identity) + end + push!(layers, + Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) + push!(layers, + BatchNorm(Int(bnplanes), activations.bn; + initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) + return rev ? reverse(layers) : layers end """ @@ -77,18 +75,19 @@ See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). - `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) - `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) """ -depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; - rev = false, - initβ = Flux.zeros32, initγ = Flux.ones32, - ϵ = 1f-5, momentum = 1f-1, - stride = 1, kwargs...) = - vcat(conv_bn(kernelsize, inplanes, inplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum, - stride = stride, groups = Int(inplanes), kwargs...), - conv_bn((1, 1), inplanes, outplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum)) +function depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; + rev = false, + initβ = Flux.zeros32, initγ = Flux.ones32, + ϵ = 1.0f-5, momentum = 1.0f-1, + stride = 1, kwargs...) + vcat(conv_bn(kernelsize, inplanes, inplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum, + stride = stride, groups = Int(inplanes), kwargs...), + conv_bn((1, 1), inplanes, outplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum)) +end """ skip_projection(inplanes, outplanes, downsample = false) @@ -101,9 +100,11 @@ Create a skip projection - `outplanes`: the number of output feature maps - `downsample`: set to `true` to downsample the input """ -skip_projection(inplanes, outplanes, downsample = false) = downsample ? - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +function skip_projection(inplanes, outplanes, downsample = false) + downsample ? + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +end # array -> PaddedView(0, array, outplanes) for zero padding arrays """ @@ -118,15 +119,16 @@ Create a identity projection - `downsample`: this argument is ignored but it is needed for compatibility with [`resnet`](#). 
""" function skip_identity(inplanes, outplanes) - if outplanes > inplanes - return Chain(MaxPool((1, 1), stride = 2), - y -> cat(y, zeros(eltype(y), - size(y, 1), - size(y, 2), - outplanes - inplanes, size(y, 4)); dims = 3)) - else - return identity - end + if outplanes > inplanes + return Chain(MaxPool((1, 1), stride = 2), + y -> cat(y, + zeros(eltype(y), + size(y, 1), + size(y, 2), + outplanes - inplanes, size(y, 4)); dims = 3)) + else + return identity + end end skip_identity(inplanes, outplanes, downsample) = skip_identity(inplanes, outplanes) @@ -142,10 +144,11 @@ Squeeze and excitation layer used by MobileNet variants (must be >= 1) """ function squeeze_excite(channels, reduction = 4) - @assert (reduction >= 1) "`reduction` must be >= 1" - SkipConnection(Chain(AdaptiveMeanPool((1, 1)), - conv_bn((1, 1), channels, channels ÷ reduction, relu; bias = false)..., - conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) + @assert (reduction>=1) "`reduction` must be >= 1" + SkipConnection(Chain(AdaptiveMeanPool((1, 1)), + conv_bn((1, 1), channels, channels ÷ reduction, relu; + bias = false)..., + conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) end """ @@ -166,21 +169,22 @@ Create a basic inverted residual block for MobileNet variants in a squeeze and excite layer (see [`squeeze_excite`](#)). Must be >= 1 or `nothing` for no squeeze and excite layer. """ -function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation = relu; +function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, + activation = relu; stride, reduction = nothing) - @assert stride in [1, 2] "`stride` has to be 1 or 2" - - pad = @. (kernel_size - 1) ÷ 2 - conv1 = (inplanes == hidden_planes) ? identity : Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) - selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) - - invres = Chain(conv1, - conv_bn(kernel_size, hidden_planes, hidden_planes, activation; - bias = false, stride, pad = pad, groups = hidden_planes)..., - selayer, - conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) - - (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres + @assert stride in [1, 2] "`stride` has to be 1 or 2" + pad = @. (kernel_size - 1) ÷ 2 + conv1 = (inplanes == hidden_planes) ? identity : + Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) + selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) + invres = Chain(conv1, + conv_bn(kernel_size, hidden_planes, hidden_planes, activation; + bias = false, stride, pad = pad, groups = hidden_planes)..., + selayer, + conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) + + (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres +end +function invertedresidual(kernel_size::Integer, args...; kwargs...) + invertedresidual((kernel_size, kernel_size), args...; kwargs...) end -invertedresidual(kernel_size::Integer, args...; kwargs...) = - invertedresidual((kernel_size, kernel_size), args...; kwargs...) diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 06116bdc2..37b6f4f7b 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -20,16 +20,13 @@ patches. 
function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) - - im_height, im_width = imsize - patch_height, patch_width = patch_size - - @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) - "Image dimensions must be divisible by the patch size." - - return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), - flatten ? _flatten_spatial : identity, - norm_layer(embedplanes)) + im_height, im_width = imsize + patch_height, patch_width = patch_size + @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) + "Image dimensions must be divisible by the patch size." + return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), + flatten ? _flatten_spatial : identity, + norm_layer(embedplanes)) end """ @@ -38,11 +35,13 @@ end Positional embedding layer used by many vision transformer-like models. """ struct ViPosEmbedding{T} - vectors::T + vectors::T end -ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) = - ViPosEmbedding(init((embedsize, npatches))) +function ViPosEmbedding(embedsize::Integer, npatches::Integer; + init = (dims::Dims{2}) -> rand(Float32, dims)) + ViPosEmbedding(init((embedsize, npatches))) +end (p::ViPosEmbedding)(x) = x .+ p.vectors @@ -54,7 +53,7 @@ ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> Appends class tokens to an input with embedding dimension `dim` for use in many vision transformer models. """ struct ClassTokens{T} - token::T + token::T end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index ca8f38f97..f14ba8a8c 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -11,10 +11,10 @@ Feedforward block used in many MLPMixer-like and vision-transformer models. - `dropout`: Dropout rate. - `activation`: Activation function to use. """ -function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; - dropout = 0., activation = gelu) - Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), - Dense(hidden_planes, outplanes), Dropout(dropout)) +function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; + dropout = 0.0, activation = gelu) + Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), + Dense(hidden_planes, outplanes), Dropout(dropout)) end """ @@ -33,12 +33,12 @@ Feedforward block based on the implementation in the paper "Pay Attention to MLP - `activation`: Activation function to use. """ function gated_mlp_block(gate_layer, inplanes::Integer, hidden_planes::Integer, - outplanes::Integer = inplanes; dropout = 0., activation = gelu) - @assert hidden_planes % 2 == 0 "`hidden_planes` must be even for gated MLP" - return Chain(Dense(inplanes, hidden_planes, activation), - Dropout(dropout), - gate_layer(hidden_planes), - Dense(hidden_planes ÷ 2, outplanes), - Dropout(dropout)) + outplanes::Integer = inplanes; dropout = 0.0, activation = gelu) + @assert hidden_planes % 2==0 "`hidden_planes` must be even for gated MLP" + return Chain(Dense(inplanes, hidden_planes, activation), + Dropout(dropout), + gate_layer(hidden_planes), + Dense(hidden_planes ÷ 2, outplanes), + Dropout(dropout)) end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) 
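[editor's note: illustrative usage sketch, not part of the patch above. It assumes the `mlp_block` / `gated_mlp_block` signatures shown in the src/layers/mlp.jl hunks, that Flux's `Dense(in, out)` form works as used elsewhere in this patch, and that the helpers are reachable as unexported names via `Metalhead.`.]

using Metalhead, Flux

# plain feedforward block: planes -> hidden -> planes, applied over the channel dim
block = Metalhead.mlp_block(256, 1024; dropout = 0.1)
x = rand(Float32, 256, 196, 1)          # (planes, npatches, batch)
@assert size(block(x)) == (256, 196, 1)

# the gated variant expects `gate_layer(hidden_planes)` to return a layer that maps
# `hidden_planes` channels down to `hidden_planes ÷ 2`; a plain Dense stands in here
# for the SpatialGatingUnit used by gMLP later in this patch.
gated = Metalhead.gated_mlp_block(planes -> Dense(planes, planes ÷ 2), 256, 1024)
@assert size(gated(x)) == (256, 196, 1)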
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index a7bce3e6c..42405b563 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -12,16 +12,16 @@ Note that this is specifically for inputs with 4 dimensions in the format (H, W, C, N) where H, W are the height and width of the input, C is the number of channels, and N is the batch size. """ -struct ChannelLayerNorm{D,T} - diag::D - ϵ::T +struct ChannelLayerNorm{D, T} + diag::D + ϵ::T end @functor ChannelLayerNorm (m::ChannelLayerNorm)(x) = m.diag(MLUtils.normalise(x, dims = ndims(x) - 1, ϵ = m.ϵ)) -function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1f-5) - diag = Flux.Scale(1, 1, sz, λ) - return ChannelLayerNorm(diag, ϵ) +function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1.0f-5) + diag = Flux.Scale(1, 1, sz, λ) + return ChannelLayerNorm(diag, ϵ) end diff --git a/src/layers/others.jl b/src/layers/others.jl index 366b273e4..249cacd0e 100644 --- a/src/layers/others.jl +++ b/src/layers/others.jl @@ -8,8 +8,9 @@ Creates a `Flux.Scale` layer that performs "`LayerScale`" - `planes`: Size of channel dimension in the input. - `λ`: initialisation value for the learnable diagonal matrix. """ -LayerScale(planes::Integer, λ) = +function LayerScale(planes::Integer, λ) λ > 0 ? Flux.Scale(fill(Float32(λ), planes), false) : identity +end """ DropPath(p) @@ -20,4 +21,4 @@ Implements Stochastic Depth - equivalent to `Dropout(p; dims = 4)` when `p` ≥ # Arguments - `p`: rate of Stochastic Depth. """ -DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity \ No newline at end of file +DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity diff --git a/src/other/mlpmixer.jl b/src/other/mlpmixer.jl index 880486dc2..a88118060 100644 --- a/src/other/mlpmixer.jl +++ b/src/other/mlpmixer.jl @@ -15,17 +15,17 @@ Creates a feedforward block for the MLPMixer architecture. - `drop_path_rate`: Stochastic depth rate - `activation`: the activation function to use in the MLP blocks """ -function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu) - tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] - return Chain(SkipConnection(Chain(LayerNorm(planes), - swapdims((2, 1, 3)), - mlp_layer(npatches, tokenplanes; activation, dropout), - swapdims((2, 1, 3)), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(LayerNorm(planes), - mlp_layer(planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +)) +function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, + dropout = 0.0, drop_path_rate = 0.0, activation = gelu) + tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] + return Chain(SkipConnection(Chain(LayerNorm(planes), + swapdims((2, 1, 3)), + mlp_layer(npatches, tokenplanes; activation, dropout), + swapdims((2, 1, 3)), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(LayerNorm(planes), + mlp_layer(planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +)) end """ @@ -50,27 +50,30 @@ Creates a model with the MLPMixer architecture. - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if not specified. 
""" -function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, norm_layer = LayerNorm, - patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., +function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, + norm_layer = LayerNorm, + patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, depth = 12, nclasses = 1000, kwargs...) - npatches = prod(imsize .÷ patch_size) - dp_rates = LinRange{Float32}(0., drop_path_rate, depth) - layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], kwargs...) - for i in 1:depth])) - - classification_head = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) - return Chain(layers, classification_head) + npatches = prod(imsize .÷ patch_size) + dp_rates = LinRange{Float32}(0.0, drop_path_rate, depth) + layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], + kwargs...) + for i in 1:depth])) + + classification_head = Chain(norm_layer(embedplanes), seconddimmean, + Dense(embedplanes, nclasses)) + return Chain(layers, classification_head) end # Configurations for MLPMixer models -mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), - :base => Dict(:depth => 12, :planes => 768), +mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), + :base => Dict(:depth => 12, :planes => 768), :large => Dict(:depth => 24, :planes => 1024), - :huge => Dict(:depth => 32, :planes => 1280)) + :huge => Dict(:depth => 32, :planes => 1280)) struct MLPMixer - layers + layers::Any end """ @@ -90,12 +93,13 @@ Creates a model with the MLPMixer architecture. See also [`Metalhead.mlpmixer`](#). """ function MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, nclasses) - MLPMixer(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, + nclasses) + MLPMixer(layers) end @functor MLPMixer @@ -124,21 +128,22 @@ Creates a block for the ResMixer architecture. 
- `λ`: initialisation constant for the LayerScale """ function resmixerblock(planes, npatches; mlp_ratio = 4.0, mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu, λ = 1e-4) -return Chain(SkipConnection(Chain(Flux.Scale(planes), - swapdims((2, 1, 3)), - Dense(npatches, npatches), - swapdims((2, 1, 3)), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(Flux.Scale(planes), - mlp_layer(planes, Int(mlp_ratio * planes); dropout, activation), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +)) + dropout = 0.0, drop_path_rate = 0.0, activation = gelu, λ = 1e-4) + return Chain(SkipConnection(Chain(Flux.Scale(planes), + swapdims((2, 1, 3)), + Dense(npatches, npatches), + swapdims((2, 1, 3)), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(Flux.Scale(planes), + mlp_layer(planes, Int(mlp_ratio * planes); dropout, + activation), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +)) end struct ResMLP - layers + layers::Any end """ @@ -158,13 +163,13 @@ Creates a model with the ResMLP architecture. See also [`Metalhead.mlpmixer`](#). """ function ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - drop_path_rate, depth, nclasses) - ResMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, + drop_path_rate, depth, nclasses) + ResMLP(layers) end @functor ResMLP @@ -185,8 +190,8 @@ Creates a spatial gating unit as described in the gMLP paper. - `proj`: the projection layer to use """ struct SpatialGatingUnit{T, F} - norm::T - proj::F + norm::T + proj::F end """ @@ -201,19 +206,19 @@ Creates a spatial gating unit as described in the gMLP paper. 
- `norm_layer`: the normalisation layer to use """ function SpatialGatingUnit(planes::Integer, npatches::Integer; norm_layer = LayerNorm) - gateplanes = planes ÷ 2 - norm = norm_layer(gateplanes) - proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) - return SpatialGatingUnit(norm, proj) + gateplanes = planes ÷ 2 + norm = norm_layer(gateplanes) + proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) + return SpatialGatingUnit(norm, proj) end @functor SpatialGatingUnit function (m::SpatialGatingUnit)(x) - u, v = chunk(x, 2; dims = 1) - v = m.norm(v) - v = m.proj(permutedims(v, (2, 1, 3))) - return u .* permutedims(v, (2, 1, 3)) + u, v = chunk(x, 2; dims = 1) + v = m.norm(v) + v = m.proj(permutedims(v, (2, 1, 3))) + return u .* permutedims(v, (2, 1, 3)) end """ @@ -235,17 +240,18 @@ Creates a feedforward block based on the gMLP model architecture described in th - `activation`: the activation function to use in the MLP blocks """ function spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = LayerNorm, - mlp_layer = gated_mlp_block, dropout = 0., drop_path_rate = 0., + mlp_layer = gated_mlp_block, dropout = 0.0, + drop_path_rate = 0.0, activation = gelu) - channelplanes = Int(mlp_ratio * planes) - sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) - return SkipConnection(Chain(norm_layer(planes), - mlp_layer(sgu, planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +) + channelplanes = Int(mlp_ratio * planes) + sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) + return SkipConnection(Chain(norm_layer(planes), + mlp_layer(sgu, planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +) end struct gMLP - layers + layers::Any end """ @@ -265,14 +271,13 @@ Creates a model with the gMLP architecture. See also [`Metalhead.mlpmixer`](#). """ function gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, - patch_size, embedplanes, drop_path_rate, depth, nclasses) - - gMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, + patch_size, embedplanes, drop_path_rate, depth, nclasses) + gMLP(layers) end @functor gMLP diff --git a/src/pretrain.jl b/src/pretrain.jl index 97ab7398e..24e6d176d 100644 --- a/src/pretrain.jl +++ b/src/pretrain.jl @@ -4,17 +4,17 @@ Load the pre-trained weights for `model` using the stored artifacts. 
""" function weights(model) - try - path = joinpath(@artifact_str(model), "$model.bson") - artifact = BSON.load(path, @__MODULE__) - if haskey(artifact, :model) - return artifact[:model] - else - throw(ArgumentError("No pre-trained weights available for $model.")) + try + path = joinpath(@artifact_str(model), "$model.bson") + artifact = BSON.load(path, @__MODULE__) + if haskey(artifact, :model) + return artifact[:model] + else + throw(ArgumentError("No pre-trained weights available for $model.")) + end + catch e + throw(ArgumentError("No pre-trained weights available for $model.")) end - catch e - throw(ArgumentError("No pre-trained weights available for $model.")) - end end """ diff --git a/src/utilities.jl b/src/utilities.jl index 39dbdd3b2..6adc1ec87 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -4,9 +4,9 @@ seconddimmean(x) = dropdims(mean(x, dims = 2); dims = 2) # utility function for making sure that all layers have a channel size divisible by 8 # used by MobileNet variants function _round_channels(channels, divisor, min_value = divisor) - new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) - # Make sure that round down does not go down by more than 10% - return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels + new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) + # Make sure that round down does not go down by more than 10% + return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels end """ @@ -47,11 +47,11 @@ swapdims(perm) = Base.Fix2(permutedims, perm) # Utility function for pretty printing large models function _maybe_big_show(io, model) - if isdefined(Flux, :_big_show) - if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL - Flux._big_show(io, model) - else - show(io, model) + if isdefined(Flux, :_big_show) + if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL + Flux._big_show(io, model) + else + show(io, model) + end end - end end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 55b3e3d30..547ca1612 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -11,13 +11,15 @@ Transformer as used in the base ViT architecture. - `mlp_ratio`: ratio of MLP layers to the number of input channels - `dropout`: dropout rate """ -function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) 
- layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; attn_drop = dropout, - proj_drop = dropout)), +), - SkipConnection(prenorm(planes, mlp_block(planes, floor(Int, mlp_ratio * planes); - dropout)), +)) - for _ in 1:depth] - Chain(layers) +function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.0) + layers = [Chain(SkipConnection(prenorm(planes, + MHAttention(planes, nheads; attn_drop = dropout, + proj_drop = dropout)), +), + SkipConnection(prenorm(planes, + mlp_block(planes, floor(Int, mlp_ratio * planes); + dropout)), +)) + for _ in 1:depth] + Chain(layers) end """ @@ -62,8 +64,10 @@ vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), :base => (depth = 12, embedplanes = 768, nheads = 12), :large => (depth = 24, embedplanes = 1024, nheads = 16), :huge => (depth = 32, embedplanes = 1280, nheads = 16), - :giant => (depth = 40, embedplanes = 1408, nheads = 16, mlp_ratio = 48/11), - :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, mlp_ratio = 64/13)) + :giant => (depth = 40, embedplanes = 1408, nheads = 16, + mlp_ratio = 48 / 11), + :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, + mlp_ratio = 64 / 13)) """ ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels = 3, @@ -83,16 +87,16 @@ Creates a Vision Transformer (ViT) model. See also [`Metalhead.vit`](#). """ struct ViT - layers + layers::Any end function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), inchannels = 3, patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) - @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" - kwargs = vit_configs[mode] - layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) + @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" + kwargs = vit_configs[mode] + layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) 
- ViT(layers) + ViT(layers) end (m::ViT)(x) = m.layers(x) diff --git a/test/convnets.jl b/test/convnets.jl index 3540c3e9f..f62ecc3fd 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -5,202 +5,192 @@ using Flux PRETRAINED_MODELS = [] @testset "AlexNet" begin - model = AlexNet() - @test size(model(x_256)) == (1000, 1) - @test_throws ArgumentError AlexNet(pretrain = true) - @test gradtest(model, x_256) + model = AlexNet() + @test size(model(x_256)) == (1000, 1) + @test_throws ArgumentError AlexNet(pretrain = true) + @test gradtest(model, x_256) end GC.safepoint() GC.gc() -@testset "VGG" begin - @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] +@testset "VGG" begin @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], + bn in [true, false] + m = VGG(sz, batchnorm = bn) @test size(m(x_224)) == (1000, 1) if (VGG, sz, bn) in PRETRAINED_MODELS - @test (VGG(sz, batchnorm = bn, pretrain = true); true) + @test (VGG(sz, batchnorm = bn, pretrain = true); true) else - @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() @testset "ResNet" begin - @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] - m = ResNet(sz) - - @test size(m(x_256)) == (1000, 1) - if (ResNet, sz) in PRETRAINED_MODELS - @test (ResNet(sz, pretrain = true); true) - else - @test_throws ArgumentError ResNet(sz, pretrain = true) + @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] + m = ResNet(sz) + + @test size(m(x_256)) == (1000, 1) + if (ResNet, sz) in PRETRAINED_MODELS + @test (ResNet(sz, pretrain = true); true) + else + @test_throws ArgumentError ResNet(sz, pretrain = true) + end + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() - end - @testset "Shortcut C" begin - m = Metalhead.resnet(Metalhead.basicblock, :C; - channel_config = [1, 1], - block_config = [2, 2, 2, 2]) + @testset "Shortcut C" begin + m = Metalhead.resnet(Metalhead.basicblock, :C; + channel_config = [1, 1], + block_config = [2, 2, 2, 2]) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - end + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + end end GC.safepoint() GC.gc() -@testset "ResNeXt" begin - @testset for depth in [50, 101, 152] +@testset "ResNeXt" begin @testset for depth in [50, 101, 152] m = ResNeXt(depth) @test size(m(x_224)) == (1000, 1) if ResNeXt in PRETRAINED_MODELS - @test (ResNeXt(depth, pretrain = true); true) + @test (ResNeXt(depth, pretrain = true); true) else - @test_throws ArgumentError ResNeXt(depth, pretrain = true) + @test_throws ArgumentError ResNeXt(depth, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() @testset "GoogLeNet" begin - m = GoogLeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (GoogLeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = GoogLeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (GoogLeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset "Inception3" begin - m = Inception3() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError Inception3(pretrain = true) - @test gradtest(m, x_224) + m = Inception3() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError Inception3(pretrain = true) + @test gradtest(m, x_224) end 
GC.safepoint() GC.gc() @testset "SqueezeNet" begin - m = SqueezeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (SqueezeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = SqueezeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (SqueezeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() -@testset "DenseNet" begin - @testset for sz in [121, 161, 169, 201] +@testset "DenseNet" begin @testset for sz in [121, 161, 169, 201] m = DenseNet(sz) @test size(m(x_224)) == (1000, 1) if (DenseNet, sz) in PRETRAINED_MODELS - @test (DenseNet(sz, pretrain = true); true) + @test (DenseNet(sz, pretrain = true); true) else - @test_throws ArgumentError DenseNet(sz, pretrain = true) + @test_throws ArgumentError DenseNet(sz, pretrain = true) end @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end GC.safepoint() GC.gc() -@testset "MobileNet" verbose = true begin - @testset "MobileNetv1" begin - m = MobileNetv1() - - @test size(m(x_224)) == (1000, 1) - if MobileNetv1 in PRETRAINED_MODELS - @test (MobileNetv1(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv1(pretrain = true) +@testset "MobileNet" verbose=true begin + @testset "MobileNetv1" begin + m = MobileNetv1() + + @test size(m(x_224)) == (1000, 1) + if MobileNetv1 in PRETRAINED_MODELS + @test (MobileNetv1(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv1(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - GC.safepoint() - GC.gc() + GC.safepoint() + GC.gc() - @testset "MobileNetv2" begin - m = MobileNetv2() + @testset "MobileNetv2" begin + m = MobileNetv2() - @test size(m(x_224)) == (1000, 1) - if MobileNetv2 in PRETRAINED_MODELS - @test (MobileNetv2(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv2(pretrain = true) + @test size(m(x_224)) == (1000, 1) + if MobileNetv2 in PRETRAINED_MODELS + @test (MobileNetv2(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv2(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - - GC.safepoint() - GC.gc() - - @testset "MobileNetv3" verbose = true begin - @testset for mode in [:small, :large] - m = MobileNetv3(mode) - - @test size(m(x_224)) == (1000, 1) - if MobileNetv3 in PRETRAINED_MODELS - @test (MobileNetv3(mode; pretrain = true); true) - else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) - end - @test gradtest(m, x_224) - end - end + + GC.safepoint() + GC.gc() + + @testset "MobileNetv3" verbose=true begin @testset for mode in [:small, :large] + m = MobileNetv3(mode) + + @test size(m(x_224)) == (1000, 1) + if MobileNetv3 in PRETRAINED_MODELS + @test (MobileNetv3(mode; pretrain = true); true) + else + @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + end + @test gradtest(m, x_224) + end end end GC.safepoint() GC.gc() -@testset "ConvNeXt" verbose = true begin - @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] +@testset "ConvNeXt" verbose=true begin @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) + m = ConvNeXt(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end GC.safepoint() GC.gc() -@testset "ConvMixer" verbose = true begin - @testset for mode in [:small, 
:base, :large] +@testset "ConvMixer" verbose=true begin @testset for mode in [:small, :base, :large] m = ConvMixer(mode) @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) GC.safepoint() GC.gc() - end -end +end end diff --git a/test/other.jl b/test/other.jl index 0162bc4bc..db0bf223c 100644 --- a/test/other.jl +++ b/test/other.jl @@ -1,38 +1,32 @@ using Metalhead, Test using Flux -@testset "MLPMixer" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "MLPMixer" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = MLPMixer(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end -@testset "ResMLP" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "ResMLP" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = ResMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end -@testset "gMLP" begin - @testset for mode in [:small, :base, :large] # :huge] +@testset "gMLP" begin @testset for mode in [:small, :base, :large] # :huge] @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + m = gMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - end -end +end end diff --git a/test/runtests.jl b/test/runtests.jl index 6dd4a1aa4..61af837a7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,33 +3,27 @@ using Flux using Flux: Zygote function gradtest(model, input) - y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) - gs = pb(ones(Float32, size(y))) + y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) + gs = pb(ones(Float32, size(y))) - # if we make it to here with no error, success! - return true + # if we make it to here with no error, success! 
+ return true end x_224 = rand(Float32, 224, 224, 3, 1) x_256 = rand(Float32, 256, 256, 3, 1) # CNN tests -@testset verbose = true "ConvNets" begin - include("convnets.jl") -end +@testset verbose=true "ConvNets" begin include("convnets.jl") end GC.safepoint() GC.gc() # Other tests -@testset verbose = true "Other" begin - include("other.jl") -end +@testset verbose=true "Other" begin include("other.jl") end GC.safepoint() GC.gc() # ViT tests -@testset verbose = true "ViTs" begin - include("vit-based.jl") -end +@testset verbose=true "ViTs" begin include("vit-based.jl") end diff --git a/test/vit-based.jl b/test/vit-based.jl index 20b6ecb86..ebd1a0fc2 100644 --- a/test/vit-based.jl +++ b/test/vit-based.jl @@ -1,12 +1,10 @@ using Metalhead, Test using Flux -@testset "ViT" begin - for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] +@testset "ViT" begin for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] m = ViT(mode) @test size(m(x_256)) == (1000, 1) @test gradtest(m, x_256) GC.safepoint() GC.gc() - end -end +end end From cb3cd285b6d0e7e66730239866078f6c5c85a11c Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Mon, 30 May 2022 06:20:20 +0530 Subject: [PATCH 6/8] Create .git-blame-ignore-revs --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..d62e45914 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,3 @@ +# .git-blame-ignore-revs +# Switched to SciML style for code +fd2869f57c66fa650547cd8581feeba9eda08b88 From e4209fca4d35b725983f424158c650cea8948238 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 11 Jun 2022 14:24:49 +0530 Subject: [PATCH 7/8] Fix formatting for tests --- src/vit-based/vit.jl | 21 ++++--- test/convnets.jl | 127 +++++++++++++++++++++---------------------- test/other.jl | 54 ++++++++++-------- test/runtests.jl | 12 +++- test/vit-based.jl | 16 +++--- 5 files changed, 121 insertions(+), 109 deletions(-) diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 547ca1612..1ebce1bbe 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -46,17 +46,16 @@ Creates a Vision Transformer (ViT) model. function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout = 0.1, emb_dropout = 0.1, pool = :class, nclasses = 1000) - - @assert pool in [:class, :mean] - "Pool type must be either :class (class token) or :mean (mean pooling)" - npatches = prod(imsize .÷ patch_size) - return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - ClassTokens(embedplanes), - ViPosEmbedding(embedplanes, npatches + 1), - Dropout(emb_dropout), - transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), - (pool == :class) ? x -> x[:, 1, :] : seconddimmean), - Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) + @assert pool in [:class, :mean] + "Pool type must be either :class (class token) or :mean (mean pooling)" + npatches = prod(imsize .÷ patch_size) + return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + ClassTokens(embedplanes), + ViPosEmbedding(embedplanes, npatches + 1), + Dropout(emb_dropout), + transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), + (pool == :class) ? 
x -> x[:, 1, :] : seconddimmean), + Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), diff --git a/test/convnets.jl b/test/convnets.jl index f62ecc3fd..7be6d70bc 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -14,21 +14,20 @@ end GC.safepoint() GC.gc() -@testset "VGG" begin @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], - bn in [true, false] - - m = VGG(sz, batchnorm = bn) - - @test size(m(x_224)) == (1000, 1) - if (VGG, sz, bn) in PRETRAINED_MODELS - @test (VGG(sz, batchnorm = bn, pretrain = true); true) - else - @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) +@testset "VGG" begin + @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] + m = VGG(sz, batchnorm = bn) + @test size(m(x_224)) == (1000, 1) + if (VGG, sz, bn) in PRETRAINED_MODELS + @test (VGG(sz, batchnorm = bn, pretrain = true); true) + else + @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() -end end +end GC.safepoint() GC.gc() @@ -36,7 +35,6 @@ GC.gc() @testset "ResNet" begin @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] m = ResNet(sz) - @test size(m(x_256)) == (1000, 1) if (ResNet, sz) in PRETRAINED_MODELS @test (ResNet(sz, pretrain = true); true) @@ -52,7 +50,6 @@ GC.gc() m = Metalhead.resnet(Metalhead.basicblock, :C; channel_config = [1, 1], block_config = [2, 2, 2, 2]) - @test size(m(x_256)) == (1000, 1) @test gradtest(m, x_256) end @@ -61,19 +58,20 @@ end GC.safepoint() GC.gc() -@testset "ResNeXt" begin @testset for depth in [50, 101, 152] - m = ResNeXt(depth) - - @test size(m(x_224)) == (1000, 1) - if ResNeXt in PRETRAINED_MODELS - @test (ResNeXt(depth, pretrain = true); true) - else - @test_throws ArgumentError ResNeXt(depth, pretrain = true) +@testset "ResNeXt" begin + @testset for depth in [50, 101, 152] + m = ResNeXt(depth) + @test size(m(x_224)) == (1000, 1) + if ResNeXt in PRETRAINED_MODELS + @test (ResNeXt(depth, pretrain = true); true) + else + @test_throws ArgumentError ResNeXt(depth, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() -end end +end GC.safepoint() GC.gc() @@ -108,19 +106,20 @@ end GC.safepoint() GC.gc() -@testset "DenseNet" begin @testset for sz in [121, 161, 169, 201] - m = DenseNet(sz) - - @test size(m(x_224)) == (1000, 1) - if (DenseNet, sz) in PRETRAINED_MODELS - @test (DenseNet(sz, pretrain = true); true) - else - @test_throws ArgumentError DenseNet(sz, pretrain = true) +@testset "DenseNet" begin + @testset for sz in [121, 161, 169, 201] + m = DenseNet(sz) + @test size(m(x_224)) == (1000, 1) + if (DenseNet, sz) in PRETRAINED_MODELS + @test (DenseNet(sz, pretrain = true); true) + else + @test_throws ArgumentError DenseNet(sz, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() -end end +end GC.safepoint() GC.gc() @@ -128,7 +127,6 @@ GC.gc() @testset "MobileNet" verbose=true begin @testset "MobileNetv1" begin m = MobileNetv1() - @test size(m(x_224)) == (1000, 1) if MobileNetv1 in PRETRAINED_MODELS @test (MobileNetv1(pretrain = true); true) @@ -143,7 +141,6 @@ GC.gc() @testset "MobileNetv2" begin m = MobileNetv2() - @test size(m(x_224)) == (1000, 1) if MobileNetv2 in PRETRAINED_MODELS @test (MobileNetv2(pretrain = 
true); true) @@ -156,39 +153,41 @@ GC.gc() GC.safepoint() GC.gc() - @testset "MobileNetv3" verbose=true begin @testset for mode in [:small, :large] - m = MobileNetv3(mode) - - @test size(m(x_224)) == (1000, 1) - if MobileNetv3 in PRETRAINED_MODELS - @test (MobileNetv3(mode; pretrain = true); true) - else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + @testset "MobileNetv3" verbose=true begin + @testset for mode in [:small, :large] + m = MobileNetv3(mode) + @test size(m(x_224)) == (1000, 1) + if MobileNetv3 in PRETRAINED_MODELS + @test (MobileNetv3(mode; pretrain = true); true) + else + @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end end + end end GC.safepoint() GC.gc() -@testset "ConvNeXt" verbose=true begin @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] - @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) - - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end -end end +@testset "ConvNeXt" verbose=true begin + @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] + @testset for drop_path_rate in [0.0, 0.5] + m = ConvNeXt(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end + end +end GC.safepoint() GC.gc() -@testset "ConvMixer" verbose=true begin @testset for mode in [:small, :base, :large] +@testset "ConvMixer" verbose=true begin + @testset for mode in [:small, :base, :large] m = ConvMixer(mode) - @test size(m(x_224)) == (1000, 1) @test gradtest(m, x_224) GC.safepoint() diff --git a/test/other.jl b/test/other.jl index db0bf223c..0d3727f05 100644 --- a/test/other.jl +++ b/test/other.jl @@ -1,32 +1,38 @@ using Metalhead, Test using Flux -@testset "MLPMixer" begin @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() +@testset "MLPMixer" begin + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = MLPMixer(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end -end end +end -@testset "ResMLP" begin @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() +@testset "ResMLP" begin + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = ResMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end -end end +end -@testset "gMLP" begin @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() +@testset "gMLP" begin + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = gMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end -end end +end diff --git a/test/runtests.jl b/test/runtests.jl index 61af837a7..79841244c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -14,16 +14,22 @@ x_224 = 
rand(Float32, 224, 224, 3, 1) x_256 = rand(Float32, 256, 256, 3, 1) # CNN tests -@testset verbose=true "ConvNets" begin include("convnets.jl") end +@testset verbose=true "ConvNets" begin + include("convnets.jl") +end GC.safepoint() GC.gc() # Other tests -@testset verbose=true "Other" begin include("other.jl") end +@testset verbose=true "Other" begin + include("other.jl") +end GC.safepoint() GC.gc() # ViT tests -@testset verbose=true "ViTs" begin include("vit-based.jl") end +@testset verbose=true "ViTs" begin + include("vit-based.jl") +end diff --git a/test/vit-based.jl b/test/vit-based.jl index ebd1a0fc2..cdaffc430 100644 --- a/test/vit-based.jl +++ b/test/vit-based.jl @@ -1,10 +1,12 @@ using Metalhead, Test using Flux -@testset "ViT" begin for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] - m = ViT(mode) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() -end end +@testset "ViT" begin + for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] + m = ViT(mode) + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() + end +end From 562f61d690ec34a1ea97bc408bd0c8db81684a28 Mon Sep 17 00:00:00 2001 From: Abhirath Anand <74202102+theabhirath@users.noreply.github.com> Date: Sat, 11 Jun 2022 14:29:21 +0530 Subject: [PATCH 8/8] Bump version to generate dev docs without error --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index c010c513d..69f35b397 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Metalhead" uuid = "dbeba491-748d-5e0e-a39e-b530a07fa0cc" -version = "0.7.1" +version = "0.7.1-DEV" [deps] Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
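[editor's note: a minimal sketch of the shape check that the reformatted test suites above revolve around, for readers trying the changes locally; it only assumes the exported constructors and input sizes already used in the hunks, and is not part of the patch.]

using Metalhead

m = ViT(:base)                      # other constructors follow the same pattern
x = rand(Float32, 256, 256, 3, 1)   # ViT defaults to a 256×256 input; most CNN tests use 224×224
size(m(x)) == (1000, 1)             # the assertion every @testset above makes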