diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 000000000..07fcf66f1 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,9 @@ +style = "sciml" +whitespace_in_kwargs = true +format_docstrings = true +always_for_in = true +join_lines_based_on_source = true +separate_kwargs_with_semicolon = true +always_use_return = true +margin = 92 +indent = 4 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..ae9c21381 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,4 @@ +# .git-blame-ignore-revs +# Switched to SciML style for code +d5d28f0ef6e1e253ecf3fdbbec2f511836c8767b +70d639de532b046980cbea8d17fb1829e04cccfe diff --git a/Project.toml b/Project.toml index c010c513d..aeb660d6a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Metalhead" uuid = "dbeba491-748d-5e0e-a39e-b530a07fa0cc" -version = "0.7.1" +version = "0.7.2-DEV" [deps] Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" @@ -16,7 +16,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" BSON = "0.3.2" Flux = "0.13" Functors = "0.2" -MLUtils = "0.2" +MLUtils = "0.2.6" NNlib = "0.7.34, 0.8" julia = "1.6" diff --git a/src/Metalhead.jl b/src/Metalhead.jl index a0fb3785a..e465b6981 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -37,22 +37,23 @@ include("vit-based/vit.jl") include("pretrain.jl") -export AlexNet, - VGG, VGG11, VGG13, VGG16, VGG19, - ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, - GoogLeNet, Inception3, SqueezeNet, - DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, - ResNeXt, - MobileNetv1, MobileNetv2, MobileNetv3, - MLPMixer, ResMLP, gMLP, - ViT, - ConvNeXt, ConvMixer +export AlexNet, + VGG, VGG11, VGG13, VGG16, VGG19, + ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, + GoogLeNet, Inception3, SqueezeNet, + DenseNet, DenseNet121, DenseNet161, DenseNet169, DenseNet201, + ResNeXt, + MobileNetv1, MobileNetv2, MobileNetv3, + MLPMixer, ResMLP, gMLP, + ViT, + ConvNeXt, ConvMixer # use Flux._big_show to pretty print large models -for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, :ResNeXt, +for T in (:AlexNet, :VGG, :ResNet, :GoogLeNet, :Inception3, :SqueezeNet, :DenseNet, + :ResNeXt, :MobileNetv1, :MobileNetv2, :MobileNetv3, :MLPMixer, :ResMLP, :gMLP, :ViT, :ConvNeXt, :ConvMixer) - @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) + @eval Base.show(io::IO, ::MIME"text/plain", model::$T) = _maybe_big_show(io, model) end end # module diff --git a/src/convnets/alexnet.jl b/src/convnets/alexnet.jl index ea3962c2a..405272dd2 100644 --- a/src/convnets/alexnet.jl +++ b/src/convnets/alexnet.jl @@ -5,26 +5,27 @@ Create an AlexNet model ([reference](https://papers.nips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf)). 
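# Editor's note: an illustrative usage sketch, not part of this diff. It shows how the
# exported AlexNet wrapper touched below is typically constructed and applied; the
# 224x224 WHCN input shape and the callable forward pass are assumptions based on the
# rest of the package rather than on this hunk.
using Metalhead, Flux
model = AlexNet(; nclasses = 10)            # random weights; pretrained weights are unsupported
y = model(rand(Float32, 224, 224, 3, 1))    # 10x1 matrix of class logits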
# Arguments -- `nclasses`: the number of output classes + + - `nclasses`: the number of output classes """ function alexnet(; nclasses = 1000) - layers = Chain(Chain(Conv((11, 11), 3 => 64, stride = (4, 4), relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((5, 5), 64 => 192, relu, pad = (2, 2)), - MaxPool((3, 3), stride = (2, 2)), - Conv((3, 3), 192 => 384, relu, pad = (1, 1)), - Conv((3, 3), 384 => 256, relu, pad = (1, 1)), - Conv((3, 3), 256 => 256, relu, pad = (1, 1)), - MaxPool((3, 3), stride = (2, 2)), - AdaptiveMeanPool((6,6))), - Chain(MLUtils.flatten, - Dropout(0.5), - Dense(256 * 6 * 6, 4096, relu), - Dropout(0.5), - Dense(4096, 4096, relu), - Dense(4096, nclasses))) - - return layers + layers = Chain(Chain(Conv((11, 11), 3 => 64, relu; stride = (4, 4), pad = (2, 2)), + MaxPool((3, 3); stride = (2, 2)), + Conv((5, 5), 64 => 192, relu; pad = (2, 2)), + MaxPool((3, 3); stride = (2, 2)), + Conv((3, 3), 192 => 384, relu; pad = (1, 1)), + Conv((3, 3), 384 => 256, relu; pad = (1, 1)), + Conv((3, 3), 256 => 256, relu; pad = (1, 1)), + MaxPool((3, 3); stride = (2, 2)), + AdaptiveMeanPool((6, 6))), + Chain(MLUtils.flatten, + Dropout(0.5), + Dense(256 * 6 * 6, 4096, relu), + Dropout(0.5), + Dense(4096, 4096, relu), + Dense(4096, nclasses))) + + return layers end """ @@ -34,21 +35,22 @@ Create a `AlexNet`. See also [`alexnet`](#). !!! warning + `AlexNet` does not currently support pretrained weights. # Arguments -- `pretrain`: set to `true` to load pre-trained weights for ImageNet -- `nclasses`: the number of output classes + + - `pretrain`: set to `true` to load pre-trained weights for ImageNet + - `nclasses`: the number of output classes """ struct AlexNet - layers + layers::Any end function AlexNet(; pretrain = false, nclasses = 1000) - layers = alexnet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "AlexNet") - - AlexNet(layers) + layers = alexnet(; nclasses = nclasses) + pretrain && loadpretrain!(layers, "AlexNet") + return AlexNet(layers) end @functor AlexNet diff --git a/src/convnets/convmixer.jl b/src/convnets/convmixer.jl index 01a6e61be..e19acb2e7 100644 --- a/src/convnets/convmixer.jl +++ b/src/convnets/convmixer.jl @@ -6,30 +6,35 @@ Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) # Arguments -- `planes`: number of planes in the output of each block -- `depth`: number of layers -- `inchannels`: number of channels in the input -- `kernel_size`: kernel size of the convolutional layers -- `patch_size`: size of the patches -- `activation`: activation function used after the convolutional layers -- `nclasses`: number of classes in the output + + - `planes`: number of planes in the output of each block + - `depth`: number of layers + - `inchannels`: number of channels in the input + - `kernel_size`: kernel size of the convolutional layers + - `patch_size`: size of the patches + - `activation`: activation function used after the convolutional layers + - `nclasses`: number of classes in the output """ function convmixer(planes, depth; inchannels = 3, kernel_size = (9, 9), patch_size::Dims{2} = (7, 7), activation = gelu, nclasses = 1000) - stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, stride = patch_size[1]) - blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; - preact = true, groups = planes, pad = SamePad())), +), - conv_bn((1, 1), planes, planes, activation; preact = true)...) 
for _ in 1:depth] - head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) - return Chain(Chain(stem..., Chain(blocks)), head) + stem = conv_bn(patch_size, inchannels, planes, activation; preact = true, + stride = patch_size[1]) + blocks = [Chain(SkipConnection(Chain(conv_bn(kernel_size, planes, planes, activation; + preact = true, groups = planes, + pad = SamePad())), +), + conv_bn((1, 1), planes, planes, activation; preact = true)...) + for _ in 1:depth] + head = Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(planes, nclasses)) + return Chain(Chain(stem..., Chain(blocks)), head) end convmixer_config = Dict(:base => Dict(:planes => 1536, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7)), + :patch_size => (7, 7)), :small => Dict(:planes => 768, :depth => 32, :kernel_size => (7, 7), - :patch_size => (7, 7)), - :large => Dict(:planes => 1024, :depth => 20, :kernel_size => (9, 9), - :patch_size => (7, 7))) + :patch_size => (7, 7)), + :large => Dict(:planes => 1024, :depth => 20, + :kernel_size => (9, 9), + :patch_size => (7, 7))) """ ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) @@ -38,22 +43,24 @@ Creates a ConvMixer model. ([reference](https://arxiv.org/abs/2201.09792)) # Arguments -- `mode`: the mode of the model, either `:base`, `:small` or `:large` -- `inchannels`: number of channels in the input -- `activation`: activation function used after the convolutional layers -- `nclasses`: number of classes in the output + + - `mode`: the mode of the model, either `:base`, `:small` or `:large` + - `inchannels`: number of channels in the input + - `activation`: activation function used after the convolutional layers + - `nclasses`: number of classes in the output """ struct ConvMixer - layers + layers::Any end function ConvMixer(mode::Symbol = :base; inchannels = 3, activation = gelu, nclasses = 1000) - planes = convmixer_config[mode][:planes] - depth = convmixer_config[mode][:depth] - kernel_size = convmixer_config[mode][:kernel_size] - patch_size = convmixer_config[mode][:patch_size] - layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, nclasses) - return ConvMixer(layers) + planes = convmixer_config[mode][:planes] + depth = convmixer_config[mode][:depth] + kernel_size = convmixer_config[mode][:kernel_size] + patch_size = convmixer_config[mode][:patch_size] + layers = convmixer(planes, depth; inchannels, kernel_size, patch_size, activation, + nclasses) + return ConvMixer(layers) end @functor ConvMixer diff --git a/src/convnets/convnext.jl b/src/convnets/convnext.jl index 1621803bf..31e2d3ac0 100644 --- a/src/convnets/convnext.jl +++ b/src/convnets/convnext.jl @@ -5,19 +5,20 @@ Creates a single block of ConvNeXt. ([reference](https://arxiv.org/abs/2201.03545)) # Arguments: -- `planes`: number of input channels. -- `drop_path_rate`: Stochastic depth rate. -- `λ`: Init value for LayerScale + + - `planes`: number of input channels. + - `drop_path_rate`: Stochastic depth rate. 
+ - `λ`: Init value for LayerScale """ -function convnextblock(planes, drop_path_rate = 0., λ = 1f-6) - layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), - swapdims((3, 1, 2, 4)), - LayerNorm(planes; ϵ = 1f-6), - mlp_block(planes, 4 * planes), - LayerScale(planes, λ), - swapdims((2, 3, 1, 4)), - DropPath(drop_path_rate)), +) - return layers +function convnextblock(planes, drop_path_rate = 0.0, λ = 1.0f-6) + layers = SkipConnection(Chain(DepthwiseConv((7, 7), planes => planes; pad = 3), + swapdims((3, 1, 2, 4)), + LayerNorm(planes; ϵ = 1.0f-6), + mlp_block(planes, 4 * planes), + LayerScale(planes, λ), + swapdims((2, 3, 1, 4)), + DropPath(drop_path_rate)), +) + return layers end """ @@ -27,52 +28,59 @@ Creates the layers for a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) # Arguments: -- `inchannels`: number of input channels. -- `depths`: list with configuration for depth of each block -- `planes`: list with configuration for number of output channels in each block -- `drop_path_rate`: Stochastic depth rate. -- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) -- `nclasses`: number of output classes + + - `inchannels`: number of input channels. + - `depths`: list with configuration for depth of each block + - `planes`: list with configuration for number of output channels in each block + - `drop_path_rate`: Stochastic depth rate. + - `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) + - `nclasses`: number of output classes """ -function convnext(depths, planes; inchannels = 3, drop_path_rate = 0., λ = 1f-6, nclasses = 1000) - @assert length(depths) == length(planes) "`planes` should have exactly one value for each block" - - downsample_layers = [] - stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), - ChannelLayerNorm(planes[1]; ϵ = 1f-6)) - push!(downsample_layers, stem) - for m in 1:length(depths) - 1 - downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1f-6), - Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) - push!(downsample_layers, downsample_layer) - end - - stages = [] - dp_rates = LinRange{Float32}(0., drop_path_rate, sum(depths)) - cur = 0 - for i in 1:length(depths) - push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) - cur += depths[i] - end - - backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) - head = Chain(GlobalMeanPool(), - MLUtils.flatten, - LayerNorm(planes[end]), - Dense(planes[end], nclasses)) - - return Chain(Chain(backbone), head) +function convnext(depths, planes; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, + nclasses = 1000) + @assert length(depths)==length(planes) "`planes` should have exactly one value for each block" + + downsample_layers = [] + stem = Chain(Conv((4, 4), inchannels => planes[1]; stride = 4), + ChannelLayerNorm(planes[1]; ϵ = 1.0f-6)) + push!(downsample_layers, stem) + for m in 1:(length(depths) - 1) + downsample_layer = Chain(ChannelLayerNorm(planes[m]; ϵ = 1.0f-6), + Conv((2, 2), planes[m] => planes[m + 1]; stride = 2)) + push!(downsample_layers, downsample_layer) + end + + stages = [] + dp_rates = LinRange{Float32}(0.0, drop_path_rate, sum(depths)) + cur = 0 + for i in 1:length(depths) + push!(stages, [convnextblock(planes[i], dp_rates[cur + j], λ) for j in 1:depths[i]]) + cur += depths[i] + end + + backbone = collect(Iterators.flatten(Iterators.flatten(zip(downsample_layers, stages)))) + head = Chain(GlobalMeanPool(), + MLUtils.flatten, + 
LayerNorm(planes[end]), + Dense(planes[end], nclasses)) + + return Chain(Chain(backbone), head) end # Configurations for ConvNeXt models -convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], :planes => [96, 192, 384, 768]), - :small => Dict(:depths => [3, 3, 27, 3], :planes => [96, 192, 384, 768]), - :base => Dict(:depths => [3, 3, 27, 3], :planes => [128, 256, 512, 1024]), - :large => Dict(:depths => [3, 3, 27, 3], :planes => [192, 384, 768, 1536]), - :xlarge => Dict(:depths => [3, 3, 27, 3], :planes => [256, 512, 1024, 2048])) +convnext_configs = Dict(:tiny => Dict(:depths => [3, 3, 9, 3], + :planes => [96, 192, 384, 768]), + :small => Dict(:depths => [3, 3, 27, 3], + :planes => [96, 192, 384, 768]), + :base => Dict(:depths => [3, 3, 27, 3], + :planes => [128, 256, 512, 1024]), + :large => Dict(:depths => [3, 3, 27, 3], + :planes => [192, 384, 768, 1536]), + :xlarge => Dict(:depths => [3, 3, 27, 3], + :planes => [256, 512, 1024, 2048])) struct ConvNeXt - layers + layers::Any end """ @@ -82,20 +90,21 @@ Creates a ConvNeXt model. ([reference](https://arxiv.org/abs/2201.03545)) # Arguments: -- `inchannels`: number of input channels. -- `drop_path_rate`: Stochastic depth rate. -- `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) -- `nclasses`: number of output classes + + - `inchannels`: number of input channels. + - `drop_path_rate`: Stochastic depth rate. + - `λ`: Init value for [LayerScale](https://arxiv.org/abs/2103.17239) + - `nclasses`: number of output classes See also [`Metalhead.convnext`](#). """ -function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0., λ = 1f-6, +function ConvNeXt(mode::Symbol = :base; inchannels = 3, drop_path_rate = 0.0, λ = 1.0f-6, nclasses = 1000) - @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" - depths = convnext_configs[mode][:depths] - planes = convnext_configs[mode][:planes] - layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) - return ConvNeXt(layers) + @assert mode in keys(convnext_configs) "`size` must be one of $(collect(keys(convnext_configs)))" + depths = convnext_configs[mode][:depths] + planes = convnext_configs[mode][:planes] + layers = convnext(depths, planes; inchannels, drop_path_rate, λ, nclasses) + return ConvNeXt(layers) end (m::ConvNeXt)(x) = m.layers(x) diff --git a/src/convnets/densenet.jl b/src/convnets/densenet.jl index bda7a321d..5384161fd 100644 --- a/src/convnets/densenet.jl +++ b/src/convnets/densenet.jl @@ -5,16 +5,18 @@ Create a Densenet bottleneck layer ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `inplanes`: number of input feature maps -- `outplanes`: number of output feature maps on bottleneck branch - (and scaling factor for inner feature maps; see ref) + + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps on bottleneck branch + (and scaling factor for inner feature maps; see ref) """ function dense_bottleneck(inplanes, outplanes) - inner_channels = 4 * outplanes - m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., - conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, rev = true)...) + inner_channels = 4 * outplanes + m = Chain(conv_bn((1, 1), inplanes, inner_channels; bias = false, rev = true)..., + conv_bn((3, 3), inner_channels, outplanes; pad = 1, bias = false, + rev = true)...) 
- SkipConnection(m, cat_channels) + return SkipConnection(m, cat_channels) end """ @@ -24,11 +26,14 @@ Create a DenseNet transition sequence ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `inplanes`: number of input feature maps -- `outplanes`: number of output feature maps + + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps """ -transition(inplanes, outplanes) = - Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., MeanPool((2, 2))) +function transition(inplanes, outplanes) + return Chain(conv_bn((1, 1), inplanes, outplanes; bias = false, rev = true)..., + MeanPool((2, 2))) +end """ dense_block(inplanes, growth_rates) @@ -38,12 +43,16 @@ the number of output feature maps by `growth_rates` with each block ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `inplanes`: number of input feature maps to the full sequence -- `growth_rates`: the growth (additive) rates of output feature maps - after each block (a vector of `k`s from the ref) + + - `inplanes`: number of input feature maps to the full sequence + - `growth_rates`: the growth (additive) rates of output feature maps + after each block (a vector of `k`s from the ref) """ -dense_block(inplanes, growth_rates) = [dense_bottleneck(i, o) - for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), growth_rates)] +function dense_block(inplanes, growth_rates) + return [dense_bottleneck(i, o) + for (i, o) in zip(inplanes .+ cumsum([0, growth_rates[1:(end - 1)]...]), + growth_rates)] +end """ densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) @@ -52,31 +61,32 @@ Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `inplanes`: the number of input feature maps to the first dense block -- `growth_rates`: the growth rates of output feature maps within each - [`dense_block`](#) (a vector of vectors) -- `reduction`: the factor by which the number of feature maps is scaled across each transition -- `nclasses`: the number of output classes + + - `inplanes`: the number of input feature maps to the first dense block + - `growth_rates`: the growth rates of output feature maps within each + [`dense_block`](#) (a vector of vectors) + - `reduction`: the factor by which the number of feature maps is scaled across each transition + - `nclasses`: the number of output classes """ function densenet(inplanes, growth_rates; reduction = 0.5, nclasses = 1000) - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) - push!(layers, MaxPool((3, 3), stride = 2, pad = (1, 1))) - - outplanes = 0 - for (i, rates) in enumerate(growth_rates) - outplanes = inplanes + sum(rates) - append!(layers, dense_block(inplanes, rates)) - (i != length(growth_rates)) && - push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) - inplanes = floor(Int, outplanes * reduction) - end - push!(layers, BatchNorm(outplanes, relu)) - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dense(outplanes, nclasses))) + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3), bias = false)) + push!(layers, MaxPool((3, 3); stride = 2, pad = (1, 1))) + + outplanes = 0 + for (i, rates) in enumerate(growth_rates) + outplanes = inplanes + sum(rates) + append!(layers, dense_block(inplanes, rates)) + (i != length(growth_rates)) && + push!(layers, transition(outplanes, floor(Int, outplanes * reduction))) + inplanes = floor(Int, 
outplanes * reduction) + end + push!(layers, BatchNorm(outplanes, relu)) + + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dense(outplanes, nclasses))) end """ @@ -86,14 +96,16 @@ Create a DenseNet model ([reference](https://arxiv.org/abs/1608.06993)). # Arguments -- `nblocks`: number of dense blocks between transitions -- `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the ref) -- `reduction`: the factor by which the number of feature maps is scaled across each transition -- `nclasses`: the number of output classes + + - `nblocks`: number of dense blocks between transitions + - `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the ref) + - `reduction`: the factor by which the number of feature maps is scaled across each transition + - `nclasses`: the number of output classes """ -densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) = - densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; - reduction = reduction, nclasses = nclasses) +function densenet(nblocks; growth_rate = 32, reduction = 0.5, nclasses = 1000) + return densenet(2 * growth_rate, [fill(growth_rate, n) for n in nblocks]; + reduction = reduction, nclasses = nclasses) +end """ DenseNet(nblocks::NTuple{N, <:Integer}; @@ -104,22 +116,23 @@ Create a DenseNet model See also [`densenet`](#). # Arguments -- `nblocks`: number of dense blocks between transitions -- `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the paper) -- `reduction`: the factor by which the number of feature maps is scaled across each transition -- `nclasses`: the number of output classes + + - `nblocks`: number of dense blocks between transitions + - `growth_rate`: the output feature map growth rate of dense blocks (i.e. `k` in the paper) + - `reduction`: the factor by which the number of feature maps is scaled across each transition + - `nclasses`: the number of output classes """ struct DenseNet - layers + layers::Any end function DenseNet(nblocks::NTuple{N, <:Integer}; growth_rate = 32, reduction = 0.5, nclasses = 1000) where {N} - layers = densenet(nblocks; growth_rate = growth_rate, - reduction = reduction, - nclasses = nclasses) + layers = densenet(nblocks; growth_rate = growth_rate, + reduction = reduction, + nclasses = nclasses) - DenseNet(layers) + return DenseNet(layers) end @functor DenseNet @@ -143,16 +156,17 @@ Create a DenseNet model with specified configuration. Currently supported values Set `pretrain = true` to load the model with pre-trained weights for ImageNet. !!! warning + `DenseNet` does not currently support pretrained weights. See also [`Metalhead.densenet`](#). """ function DenseNet(config::Integer = 121; pretrain = false, nclasses = 1000) - @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." - model = DenseNet(densenet_config[config]; nclasses = nclasses) + @assert config in keys(densenet_config) "`config` must be one out of $(sort(collect(keys(densenet_config))))." 
+ model = DenseNet(densenet_config[config]; nclasses = nclasses) - pretrain && loadpretrain!(model, string("DenseNet", config)) - return model + pretrain && loadpretrain!(model, string("DenseNet", config)) + return model end # deprecations diff --git a/src/convnets/googlenet.jl b/src/convnets/googlenet.jl index bc42a052f..318463494 100644 --- a/src/convnets/googlenet.jl +++ b/src/convnets/googlenet.jl @@ -5,13 +5,14 @@ Create an inception module for use in GoogLeNet ([reference](https://arxiv.org/abs/1409.4842v1)). # Arguments -- `inplanes`: the number of input feature maps -- `out_1x1`: the number of output feature maps for the 1x1 convolution (branch 1) -- `red_3x3`: the number of output feature maps for the 3x3 reduction convolution (branch 2) -- `out_3x3`: the number of output feature maps for the 3x3 convolution (branch 2) -- `red_5x5`: the number of output feature maps for the 5x5 reduction convolution (branch 3) -- `out_5x5`: the number of output feature maps for the 5x5 convolution (branch 3) -- `pool_proj`: the number of output feature maps for the pooling projection (branch 4) + + - `inplanes`: the number of input feature maps + - `out_1x1`: the number of output feature maps for the 1x1 convolution (branch 1) + - `red_3x3`: the number of output feature maps for the 3x3 reduction convolution (branch 2) + - `out_3x3`: the number of output feature maps for the 3x3 convolution (branch 2) + - `red_5x5`: the number of output feature maps for the 5x5 reduction convolution (branch 3) + - `out_5x5`: the number of output feature maps for the 5x5 convolution (branch 3) + - `pool_proj`: the number of output feature maps for the pooling projection (branch 4) """ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, pool_proj) branch1 = Chain(Conv((1, 1), inplanes => out_1x1)) @@ -20,9 +21,8 @@ function _inceptionblock(inplanes, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, Conv((3, 3), red_3x3 => out_3x3; pad = 1)) branch3 = Chain(Conv((1, 1), inplanes => red_5x5), - Conv((5, 5), red_5x5 => out_5x5; pad = 2)) - - branch4 = Chain(MaxPool((3, 3), stride=1, pad = 1), + Conv((5, 5), red_5x5 => out_5x5; pad = 2)) + branch4 = Chain(MaxPool((3, 3); stride = 1, pad = 1), Conv((1, 1), inplanes => pool_proj)) return Parallel(cat_channels, @@ -36,31 +36,31 @@ Create an Inception-v1 model (commonly referred to as GoogLeNet) ([reference](https://arxiv.org/abs/1409.4842v1)). 
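# Editor's note: a hedged usage sketch, not part of this diff. The lower-case
# `googlenet` builder documented here returns a plain Flux Chain; the 224x224 input
# size and the unexported `Metalhead.googlenet` qualification are assumptions for illustration.
using Metalhead, Flux
chain = Metalhead.googlenet(; nclasses = 100)
size(chain(rand(Float32, 224, 224, 3, 2)))  # expected to be (100, 2)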
# Arguments -- `nclasses`: the number of output classes + + - `nclasses`: the number of output classes """ function googlenet(; nclasses = 1000) - layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), - MaxPool((3, 3), stride = 2, pad = 1), - Conv((1, 1), 64 => 64), - Conv((3, 3), 64 => 192; pad = 1), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(192, 64, 96, 128, 16, 32, 32), - _inceptionblock(256, 128, 128, 192, 32, 96, 64), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(480, 192, 96, 208, 16, 48, 64), - _inceptionblock(512, 160, 112, 224, 24, 64, 64), - _inceptionblock(512, 128, 128, 256, 24, 64, 64), - _inceptionblock(512, 112, 144, 288, 32, 64, 64), - _inceptionblock(528, 256, 160, 320, 32, 128, 128), - MaxPool((3, 3), stride = 2, pad = 1), - _inceptionblock(832, 256, 160, 320, 32, 128, 128), - _inceptionblock(832, 384, 192, 384, 48, 128, 128)), - Chain(AdaptiveMeanPool((1, 1)), - MLUtils.flatten, - Dropout(0.4), - Dense(1024, nclasses))) - - return layers + layers = Chain(Chain(Conv((7, 7), 3 => 64; stride = 2, pad = 3), + MaxPool((3, 3); stride = 2, pad = 1), + Conv((1, 1), 64 => 64), + Conv((3, 3), 64 => 192; pad = 1), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(192, 64, 96, 128, 16, 32, 32), + _inceptionblock(256, 128, 128, 192, 32, 96, 64), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(480, 192, 96, 208, 16, 48, 64), + _inceptionblock(512, 160, 112, 224, 24, 64, 64), + _inceptionblock(512, 128, 128, 256, 24, 64, 64), + _inceptionblock(512, 112, 144, 288, 32, 64, 64), + _inceptionblock(528, 256, 160, 320, 32, 128, 128), + MaxPool((3, 3); stride = 2, pad = 1), + _inceptionblock(832, 256, 160, 320, 32, 128, 128), + _inceptionblock(832, 384, 192, 384, 48, 128, 128)), + Chain(AdaptiveMeanPool((1, 1)), + MLUtils.flatten, + Dropout(0.4), + Dense(1024, nclasses))) + return layers end """ @@ -70,23 +70,24 @@ Create an Inception-v1 model (commonly referred to as `GoogLeNet`) ([reference](https://arxiv.org/abs/1409.4842v1)). # Arguments -- `pretrain`: set to `true` to load the model with pre-trained weights for ImageNet -- `nclasses`: the number of output classes + + - `pretrain`: set to `true` to load the model with pre-trained weights for ImageNet + - `nclasses`: the number of output classes !!! warning + `GoogLeNet` does not currently support pretrained weights. See also [`googlenet`](#). """ struct GoogLeNet - layers + layers::Any end function GoogLeNet(; pretrain = false, nclasses = 1000) - layers = googlenet(nclasses = nclasses) - pretrain && loadpretrain!(layers, "GoogLeNet") - - GoogLeNet(layers) + layers = googlenet(; nclasses = nclasses) + pretrain && loadpretrain!(layers, "GoogLeNet") + return GoogLeNet(layers) end @functor GoogLeNet diff --git a/src/convnets/inception.jl b/src/convnets/inception.jl index ef8ab81ef..e81644599 100644 --- a/src/convnets/inception.jl +++ b/src/convnets/inception.jl @@ -5,24 +5,21 @@ Create an Inception-v3 style-A module (ref: Fig. 5 in [paper](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `inplanes`: number of input feature maps -- `pool_proj`: the number of output feature maps for the pooling projection + + - `inplanes`: number of input feature maps + - `pool_proj`: the number of output feature maps for the pooling projection """ function inception_a(inplanes, pool_proj) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) - - branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., - conv_bn((5, 5), 48, 64; pad = 2)...) 
- - branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; pad = 1)...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, pool_proj)...) - - return Parallel(cat_channels, - branch1x1, branch5x5, branch3x3, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 64)) + branch5x5 = Chain(conv_bn((1, 1), inplanes, 48)..., + conv_bn((5, 5), 48, 64; pad = 2)...) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; pad = 1)...) + branch_pool = Chain(MeanPool((3, 3); pad = 1, stride = 1), + conv_bn((1, 1), inplanes, pool_proj)...) + return Parallel(cat_channels, + branch1x1, branch5x5, branch3x3, branch_pool) end """ @@ -32,19 +29,17 @@ Create an Inception-v3 style-B module (ref: Fig. 10 in [paper](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `inplanes`: number of input feature maps + + - `inplanes`: number of input feature maps """ function inception_b(inplanes) - branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., - conv_bn((3, 3), 64, 96; pad = 1)..., - conv_bn((3, 3), 96, 96; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride = 2) - - return Parallel(cat_channels, - branch3x3_1, branch3x3_2, branch_pool) + branch3x3_1 = Chain(conv_bn((3, 3), inplanes, 384; stride = 2)) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 64)..., + conv_bn((3, 3), 64, 96; pad = 1)..., + conv_bn((3, 3), 96, 96; stride = 2)...) + branch_pool = MaxPool((3, 3); stride = 2) + return Parallel(cat_channels, + branch3x3_1, branch3x3_2, branch_pool) end """ @@ -54,28 +49,25 @@ Create an Inception-v3 style-C module (ref: Fig. 6 in [paper](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `inplanes`: number of input feature maps -- `inner_planes`: the number of output feature maps within each branch -- `n`: the "grid size" (kernel size) for the convolution layers + + - `inplanes`: number of input feature maps + - `inner_planes`: the number of output feature maps within each branch + - `n`: the "grid size" (kernel size) for the convolution layers """ function inception_c(inplanes, inner_planes, n = 7) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) - - branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) - - branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., - conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., - conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride=1), - conv_bn((1, 1), inplanes, 192)...) - - return Parallel(cat_channels, - branch1x1, branch7x7_1, branch7x7_2, branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 192)) + branch7x7_1 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, 192; pad = (3, 0))...) + branch7x7_2 = Chain(conv_bn((1, 1), inplanes, inner_planes)..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, inner_planes; pad = (0, 3))..., + conv_bn((n, 1), inner_planes, inner_planes; pad = (3, 0))..., + conv_bn((1, n), inner_planes, 192; pad = (0, 3))...) 
+ branch_pool = Chain(MeanPool((3, 3); pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) + return Parallel(cat_channels, + branch1x1, branch7x7_1, branch7x7_2, branch_pool) end """ @@ -85,21 +77,19 @@ Create an Inception-v3 style-D module (ref: [pytorch](https://github.com/pytorch/vision/blob/6db1569c89094cf23f3bc41f79275c45e9fcb3f3/torchvision/models/inception.py#L322)). # Arguments -- `inplanes`: number of input feature maps + + - `inplanes`: number of input feature maps """ function inception_d(inplanes) - branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((3, 3), 192, 320; stride = 2)...) - - branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., - conv_bn((1, 7), 192, 192; pad = (0, 3))..., - conv_bn((7, 1), 192, 192; pad = (3, 0))..., - conv_bn((3, 3), 192, 192; stride = 2)...) - - branch_pool = MaxPool((3, 3), stride=2) - - return Parallel(cat_channels, - branch3x3, branch7x7x3, branch_pool) + branch3x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((3, 3), 192, 320; stride = 2)...) + branch7x7x3 = Chain(conv_bn((1, 1), inplanes, 192)..., + conv_bn((1, 7), 192, 192; pad = (0, 3))..., + conv_bn((7, 1), 192, 192; pad = (3, 0))..., + conv_bn((3, 3), 192, 192; stride = 2)...) + branch_pool = MaxPool((3, 3); stride = 2) + return Parallel(cat_channels, + branch3x3, branch7x7x3, branch_pool) end """ @@ -109,33 +99,29 @@ Create an Inception-v3 style-E module (ref: Fig. 7 in [paper](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `inplanes`: number of input feature maps + + - `inplanes`: number of input feature maps """ function inception_e(inplanes) - branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) - - branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) - branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., - conv_bn((3, 3), 448, 384; pad = 1)...) - branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) - branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) - - branch_pool = Chain(MeanPool((3, 3), pad = 1, stride = 1), - conv_bn((1, 1), inplanes, 192)...) - - return Parallel(cat_channels, - branch1x1, - Chain(branch3x3_1, - Parallel(cat_channels, - branch3x3_1a, branch3x3_1b)), - - Chain(branch3x3_2, - Parallel(cat_channels, - branch3x3_2a, branch3x3_2b)), - branch_pool) + branch1x1 = Chain(conv_bn((1, 1), inplanes, 320)) + branch3x3_1 = Chain(conv_bn((1, 1), inplanes, 384)) + branch3x3_1a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_1b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch3x3_2 = Chain(conv_bn((1, 1), inplanes, 448)..., + conv_bn((3, 3), 448, 384; pad = 1)...) + branch3x3_2a = Chain(conv_bn((1, 3), 384, 384; pad = (0, 1))) + branch3x3_2b = Chain(conv_bn((3, 1), 384, 384; pad = (1, 0))) + branch_pool = Chain(MeanPool((3, 3); pad = 1, stride = 1), + conv_bn((1, 1), inplanes, 192)...) + return Parallel(cat_channels, + branch1x1, + Chain(branch3x3_1, + Parallel(cat_channels, + branch3x3_1a, branch3x3_1b)), + Chain(branch3x3_2, + Parallel(cat_channels, + branch3x3_2a, branch3x3_2b)), + branch_pool) end """ @@ -144,36 +130,37 @@ end Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). # Arguments -- `nclasses`: the number of output classes + + - `nclasses`: the number of output classes !!! warning + `inception3` does not currently support pretrained weights. 
""" function inception3(; nclasses = 1000) - layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., - conv_bn((3, 3), 32, 32)..., - conv_bn((3, 3), 32, 64; pad = 1)..., - MaxPool((3, 3), stride = 2), - conv_bn((1, 1), 64, 80)..., - conv_bn((3, 3), 80, 192)..., - MaxPool((3, 3), stride = 2), - inception_a(192, 32), - inception_a(256, 64), - inception_a(288, 64), - inception_b(288), - inception_c(768, 128), - inception_c(768, 160), - inception_c(768, 160), - inception_c(768, 192), - inception_d(768), - inception_e(1280), - inception_e(2048)), - Chain(AdaptiveMeanPool((1, 1)), - Dropout(0.2), - MLUtils.flatten, - Dense(2048, nclasses))) - - return layer + layer = Chain(Chain(conv_bn((3, 3), 3, 32; stride = 2)..., + conv_bn((3, 3), 32, 32)..., + conv_bn((3, 3), 32, 64; pad = 1)..., + MaxPool((3, 3); stride = 2), + conv_bn((1, 1), 64, 80)..., + conv_bn((3, 3), 80, 192)..., + MaxPool((3, 3); stride = 2), + inception_a(192, 32), + inception_a(256, 64), + inception_a(288, 64), + inception_b(288), + inception_c(768, 128), + inception_c(768, 160), + inception_c(768, 160), + inception_c(768, 192), + inception_d(768), + inception_e(1280), + inception_e(2048)), + Chain(AdaptiveMeanPool((1, 1)), + Dropout(0.2), + MLUtils.flatten, + Dense(2048, nclasses))) + return layer end """ @@ -183,21 +170,22 @@ Create an Inception-v3 model ([reference](https://arxiv.org/abs/1512.00567v3)). See also [`inception3`](#). # Arguments -- `pretrain`: set to `true` to load the pre-trained weights for ImageNet -- `nclasses`: the number of output classes + + - `pretrain`: set to `true` to load the pre-trained weights for ImageNet + - `nclasses`: the number of output classes !!! warning + `Inception3` does not currently support pretrained weights. """ struct Inception3 - layers + layers::Any end function Inception3(; pretrain = false, nclasses = 1000) - layers = inception3(nclasses = nclasses) - pretrain && loadpretrain!(layers, "Inception3") - - Inception3(layers) + layers = inception3(; nclasses = nclasses) + pretrain && loadpretrain!(layers, "Inception3") + return Inception3(layers) end @functor Inception3 diff --git a/src/convnets/mobilenet.jl b/src/convnets/mobilenet.jl index 2dfd06f8d..f2f85a383 100644 --- a/src/convnets/mobilenet.jl +++ b/src/convnets/mobilenet.jl @@ -10,81 +10,86 @@ Create a MobileNetv1 model ([reference](https://arxiv.org/abs/1704.04861v1)). 
# Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper) -- `configs`: A "list of tuples" configuration for each layer that details: - - `dw`: Set true to use a depthwise separable convolution or false for regular convolution - - `o`: The number of output feature maps - - `s`: The stride of the convolutional kernel - - `r`: The number of time this configuration block is repeated -- `activate`: The activation function to use throughout the network -- `inchannels`: The number of input feature maps`` -- `fcsize`: The intermediate fully-connected size between the convolution and final layers -- `nclasses`: The number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper) + + - `configs`: A "list of tuples" configuration for each layer that details: + + + `dw`: Set true to use a depthwise separable convolution or false for regular convolution + + `o`: The number of output feature maps + + `s`: The stride of the convolutional kernel + + `r`: The number of time this configuration block is repeated + - `activate`: The activation function to use throughout the network + - `inchannels`: The number of input feature maps`` + - `fcsize`: The intermediate fully-connected size between the convolution and final layers + - `nclasses`: The number of output classes """ function mobilenetv1(width_mult, config; activation = relu, inchannels = 3, nclasses = 1000, fcsize = 1024) - layers = [] - for (dw, outch, stride, nrepeats) in config - outch = Int(outch * width_mult) - for _ in 1:nrepeats - layer = dw ? depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; - stride = stride, pad = 1) : - conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) - append!(layers, layer) - inchannels = outch + layers = [] + for (dw, outch, stride, nrepeats) in config + outch = Int(outch * width_mult) + for _ in 1:nrepeats + layer = dw ? + depthwise_sep_conv_bn((3, 3), inchannels, outch, activation; + stride = stride, pad = 1) : + conv_bn((3, 3), inchannels, outch, activation; stride = stride, pad = 1) + append!(layers, layer) + inchannels = outch + end end - end - return Chain(Chain(layers), - Chain(GlobalMeanPool(), - MLUtils.flatten, - Dense(inchannels, fcsize, activation), - Dense(fcsize, nclasses))) + return Chain(Chain(layers), + Chain(GlobalMeanPool(), + MLUtils.flatten, + Dense(inchannels, fcsize, activation), + Dense(fcsize, nclasses))) end const mobilenetv1_configs = [ -# dw, c, s, r - (false, 32, 2, 1), - ( true, 64, 1, 1), - ( true, 128, 2, 1), - ( true, 128, 1, 1), - ( true, 256, 2, 1), - ( true, 256, 1, 1), - ( true, 512, 2, 1), - ( true, 512, 1, 5), - ( true, 1024, 2, 1), - ( true, 1024, 1, 1) + # dw, c, s, r + (false, 32, 2, 1), + (true, 64, 1, 1), + (true, 128, 2, 1), + (true, 128, 1, 1), + (true, 256, 2, 1), + (true, 256, 1, 1), + (true, 512, 2, 1), + (true, 512, 1, 5), + (true, 1024, 2, 1), + (true, 1024, 1, 1), ] """ MobileNetv1(width_mult = 1; pretrain = false, nclasses = 1000) -Create a MobileNetv1 model with the baseline configuration +Create a MobileNetv1 model with the baseline configuration ([reference](https://arxiv.org/abs/1704.04861v1)). Set `pretrain` to `true` to load the pretrained weights for ImageNet. 
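# Editor's note: an illustrative sketch, not part of this diff. It constructs the
# exported MobileNetv1 wrapper with a reduced width multiplier; whether ImageNet
# weights are actually published for this model is not asserted here.
using Metalhead
m = MobileNetv1(0.75; nclasses = 1000)      # random weights, width multiplier 0.75
# MobileNetv1(1; pretrain = true)           # would call loadpretrain!; errors if no artifact exists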
# Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper; - this is usually a value between 0.1 and 1.4) -- `pretrain`: Whether to load the pre-trained weights for ImageNet -- `nclasses`: The number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper; + this is usually a value between 0.1 and 1.4) + - `pretrain`: Whether to load the pre-trained weights for ImageNet + - `nclasses`: The number of output classes See also [`Metalhead.mobilenetv1`](#). """ struct MobileNetv1 - layers + layers::Any end function MobileNetv1(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv1")) + layers = mobilenetv1(width_mult, mobilenetv1_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv1")) - return MobileNetv1(layers) + return MobileNetv1(layers) end @functor MobileNetv1 @@ -103,56 +108,60 @@ Create a MobileNetv2 model. ([reference](https://arxiv.org/abs/1801.04381)). # Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper) -- `configs`: A "list of tuples" configuration for each layer that details: - - `t`: The expansion factor that controls the number of feature maps in the bottleneck layer - - `c`: The number of output feature maps - - `n`: The number of times a block is repeated - - `s`: The stride of the convolutional kernel - - `a`: The activation function used in the bottleneck layer -- `max_width`: The maximum number of feature maps in any layer of the network -- `nclasses`: The number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper) + + - `configs`: A "list of tuples" configuration for each layer that details: + + + `t`: The expansion factor that controls the number of feature maps in the bottleneck layer + + `c`: The number of output feature maps + + `n`: The number of times a block is repeated + + `s`: The stride of the convolutional kernel + + `a`: The activation function used in the bottleneck layer + - `max_width`: The maximum number of feature maps in any layer of the network + - `nclasses`: The number of output classes """ function mobilenetv2(width_mult, configs; max_width = 1280, nclasses = 1000) - # building first layer - inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, stride = 2)) - - # building inverted residual blocks - for (t, c, n, s, a) in configs - outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) - for i in 1:n - push!(layers, invertedresidual(3, inplanes, inplanes * t, outplanes, a; - stride = i == 1 ? s : 1)) - inplanes = outplanes + # building first layer + inplanes = _round_channels(32 * width_mult, width_mult == 0.1 ? 4 : 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes; stride = 2)) + # building inverted residual blocks + for (t, c, n, s, a) in configs + outplanes = _round_channels(c * width_mult, width_mult == 0.1 ? 4 : 8) + for i in 1:n + push!(layers, + invertedresidual(3, inplanes, inplanes * t, outplanes, a; + stride = i == 1 ? s : 1)) + inplanes = outplanes + end end - end - - # building last several layers - outplanes = (width_mult > 1) ? 
_round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : - max_width - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, outplanes, relu6, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(outplanes, nclasses))) + # building last several layers + outplanes = (width_mult > 1) ? + _round_channels(max_width * width_mult, width_mult == 0.1 ? 4 : 8) : + max_width + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, outplanes, relu6; bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(outplanes, nclasses))) end # Layer configurations for MobileNetv2 const mobilenetv2_configs = [ -# t, c, n, s, a - (1, 16, 1, 1, relu6), - (6, 24, 2, 2, relu6), - (6, 32, 3, 2, relu6), - (6, 64, 4, 2, relu6), - (6, 96, 3, 1, relu6), - (6, 160, 3, 2, relu6), - (6, 320, 1, 1, relu6) + # t, c, n, s, a + (1, 16, 1, 1, relu6), + (6, 24, 2, 2, relu6), + (6, 32, 3, 2, relu6), + (6, 64, 4, 2, relu6), + (6, 96, 3, 1, relu6), + (6, 160, 3, 2, relu6), + (6, 320, 1, 1, relu6), ] # Model definition for MobileNetv2 struct MobileNetv2 - layers + layers::Any end """ @@ -160,22 +169,22 @@ end Create a MobileNetv2 model with the specified configuration. ([reference](https://arxiv.org/abs/1801.04381)). -Set `pretrain` to `true` to load the pretrained weights for ImageNet. +Set `pretrain` to `true` to load the pretrained weights for ImageNet. # Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper; - this is usually a value between 0.1 and 1.4) -- `pretrain`: Whether to load the pre-trained weights for ImageNet -- `nclasses`: The number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper; + this is usually a value between 0.1 and 1.4) + - `pretrain`: Whether to load the pre-trained weights for ImageNet + - `nclasses`: The number of output classes See also [`Metalhead.mobilenetv2`](#). """ function MobileNetv2(width_mult::Number = 1; pretrain = false, nclasses = 1000) - layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv2")) - - MobileNetv2(layers) + layers = mobilenetv2(width_mult, mobilenetv2_configs; nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv2")) + return MobileNetv2(layers) end @functor MobileNetv2 @@ -194,85 +203,87 @@ Create a MobileNetv3 model. ([reference](https://arxiv.org/abs/1905.02244)). 
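# Editor's note: a hedged sketch, not part of this diff, showing a direct call to the
# low-level `mobilenetv3` builder with the unexported `mobilenetv3_configs` table that
# appears later in this file; treating that table as a public entry point is an assumption.
using Metalhead
chain = Metalhead.mobilenetv3(1.0, Metalhead.mobilenetv3_configs[:small]; nclasses = 10)
size(chain(rand(Float32, 224, 224, 3, 1)))  # expected to be (10, 1)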
# Arguments -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper; - this is usually a value between 0.1 and 1.4) -- `configs`: a "list of tuples" configuration for each layer that details: - - `k::Integer` - The size of the convolutional kernel - - `c::Float` - The multiplier factor for deciding the number of feature maps in the hidden layer - - `t::Integer` - The number of output feature maps for a given block - - `r::Integer` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers - - `s::Integer` - The stride of the convolutional kernel - - `a` - The activation function used in the bottleneck (typically `hardswish` or `relu`) -- `max_width`: The maximum number of feature maps in any layer of the network -- `nclasses`: the number of output classes + + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper; + this is usually a value between 0.1 and 1.4) + + - `configs`: a "list of tuples" configuration for each layer that details: + + + `k::Integer` - The size of the convolutional kernel + + `c::Float` - The multiplier factor for deciding the number of feature maps in the hidden layer + + `t::Integer` - The number of output feature maps for a given block + + `r::Integer` - The reduction factor (`>= 1` or `nothing` to skip) for squeeze and excite layers + + `s::Integer` - The stride of the convolutional kernel + + `a` - The activation function used in the bottleneck (typically `hardswish` or `relu`) + - `max_width`: The maximum number of feature maps in any layer of the network + - `nclasses`: the number of output classes """ function mobilenetv3(width_mult, configs; max_width = 1024, nclasses = 1000) - # building first layer - inplanes = _round_channels(16 * width_mult, 8) - layers = [] - append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) - explanes = 0 - # building inverted residual blocks - for (k, t, c, r, a, s) in configs - # inverted residual layers - outplanes = _round_channels(c * width_mult, 8) - explanes = _round_channels(inplanes * t, 8) - push!(layers, invertedresidual(k, inplanes, explanes, outplanes, a; - stride = s, reduction = r)) - inplanes = outplanes - end - - # building last several layers - output_channel = max_width - output_channel = width_mult > 1.0 ? _round_channels(output_channel * width_mult, 8) : output_channel - classifier = Chain(Dense(explanes, output_channel, hardswish), - Dropout(0.2), - Dense(output_channel, nclasses)) - - return Chain(Chain(Chain(layers), conv_bn((1, 1), inplanes, explanes, hardswish, bias = false)...), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) + # building first layer + inplanes = _round_channels(16 * width_mult, 8) + layers = [] + append!(layers, conv_bn((3, 3), 3, inplanes, hardswish; stride = 2)) + explanes = 0 + # building inverted residual blocks + for (k, t, c, r, a, s) in configs + # inverted residual layers + outplanes = _round_channels(c * width_mult, 8) + explanes = _round_channels(inplanes * t, 8) + push!(layers, + invertedresidual(k, inplanes, explanes, outplanes, a; + stride = s, reduction = r)) + inplanes = outplanes + end + # building last several layers + output_channel = max_width + output_channel = width_mult > 1.0 ? 
_round_channels(output_channel * width_mult, 8) : + output_channel + classifier = Chain(Dense(explanes, output_channel, hardswish), + Dropout(0.2), + Dense(output_channel, nclasses)) + return Chain(Chain(Chain(layers), + conv_bn((1, 1), inplanes, explanes, hardswish; bias = false)...), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, classifier)) end # Configurations for small and large mode for MobileNetv3 -mobilenetv3_configs = Dict( - :small => [ - # k, t, c, SE, a, s - (3, 1, 16, 4, relu, 2), - (3, 4.5, 24, nothing, relu, 2), - (3, 3.67, 24, nothing, relu, 1), - (5, 4, 40, 4, hardswish, 2), - (5, 6, 40, 4, hardswish, 1), - (5, 6, 40, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 3, 48, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 2), - (5, 6, 96, 4, hardswish, 1), - (5, 6, 96, 4, hardswish, 1), - ], - :large => [ - # k, t, c, SE, a, s - (3, 1, 16, nothing, relu, 1), - (3, 4, 24, nothing, relu, 2), - (3, 3, 24, nothing, relu, 1), - (5, 3, 40, 4, relu, 2), - (5, 3, 40, 4, relu, 1), - (5, 3, 40, 4, relu, 1), - (3, 6, 80, nothing, hardswish, 2), - (3, 2.5, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 2.3, 80, nothing, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (3, 6, 112, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 2), - (5, 6, 160, 4, hardswish, 1), - (5, 6, 160, 4, hardswish, 1) - ] -) +mobilenetv3_configs = Dict(:small => [ + # k, t, c, SE, a, s + (3, 1, 16, 4, relu, 2), + (3, 4.5, 24, nothing, relu, 2), + (3, 3.67, 24, nothing, relu, 1), + (5, 4, 40, 4, hardswish, 2), + (5, 6, 40, 4, hardswish, 1), + (5, 6, 40, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 3, 48, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 2), + (5, 6, 96, 4, hardswish, 1), + (5, 6, 96, 4, hardswish, 1), + ], + :large => [ + # k, t, c, SE, a, s + (3, 1, 16, nothing, relu, 1), + (3, 4, 24, nothing, relu, 2), + (3, 3, 24, nothing, relu, 1), + (5, 3, 40, 4, relu, 2), + (5, 3, 40, 4, relu, 1), + (5, 3, 40, 4, relu, 1), + (3, 6, 80, nothing, hardswish, 2), + (3, 2.5, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 2.3, 80, nothing, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (3, 6, 112, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 2), + (5, 6, 160, 4, hardswish, 1), + (5, 6, 160, 4, hardswish, 1), + ]) # Model definition for MobileNetv3 struct MobileNetv3 - layers + layers::Any end """ @@ -283,22 +294,24 @@ Create a MobileNetv3 model with the specified configuration. Set `pretrain = true` to load the model with pre-trained weights for ImageNet. # Arguments -- `mode`: :small or :large for the size of the model (see paper). -- `width_mult`: Controls the number of output feature maps in each block - (with 1.0 being the default in the paper; - this is usually a value between 0.1 and 1.4) -- `pretrain`: whether to load the pre-trained weights for ImageNet -- `nclasses`: the number of output classes + + - `mode`: :small or :large for the size of the model (see paper). + - `width_mult`: Controls the number of output feature maps in each block + (with 1.0 being the default in the paper; + this is usually a value between 0.1 and 1.4) + - `pretrain`: whether to load the pre-trained weights for ImageNet + - `nclasses`: the number of output classes See also [`Metalhead.mobilenetv3`](#). """ -function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, nclasses = 1000) - @assert mode in [:large, :small] "`mode` has to be either :large or :small" - - max_width = (mode == :large) ? 
1280 : 1024 - layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, nclasses = nclasses) - pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) - MobileNetv3(layers) +function MobileNetv3(mode::Symbol = :small, width_mult::Number = 1; pretrain = false, + nclasses = 1000) + @assert mode in [:large, :small] "`mode` has to be either :large or :small" + max_width = (mode == :large) ? 1280 : 1024 + layers = mobilenetv3(width_mult, mobilenetv3_configs[mode]; max_width = max_width, + nclasses = nclasses) + pretrain && loadpretrain!(layers, string("MobileNetv3", mode)) + return MobileNetv3(layers) end @functor MobileNetv3 diff --git a/src/convnets/resnet.jl b/src/convnets/resnet.jl index d91d65d6a..1a84bac68 100644 --- a/src/convnets/resnet.jl +++ b/src/convnets/resnet.jl @@ -5,15 +5,18 @@ Create a basic residual block ([reference](https://arxiv.org/abs/1512.03385v1)). # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: a list of the number of output feature maps for each convolution - within the residual block -- `downsample`: set to `true` to downsample the input + + - `inplanes`: the number of input feature maps + - `outplanes`: a list of the number of output feature maps for each convolution + within the residual block + - `downsample`: set to `true` to downsample the input """ function basicblock(inplanes, outplanes, downsample = false) - stride = downsample ? 2 : 1 - Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, bias = false)...) + stride = downsample ? 2 : 1 + return Chain(conv_bn((3, 3), inplanes, outplanes[1]; stride = stride, pad = 1, + bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2], identity; stride = 1, pad = 1, + bias = false)...) end """ @@ -28,17 +31,21 @@ This version is standard across various ML frameworks. The original paper uses `stride == [2, 1, 1]` when `downsample == true` instead. # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: a list of the number of output feature maps for each convolution - within the residual block -- `downsample`: set to `true` to downsample the input -- `stride`: a list of the stride of the 3 convolutional layers + + - `inplanes`: the number of input feature maps + - `outplanes`: a list of the number of output feature maps for each convolution + within the residual block + - `downsample`: set to `true` to downsample the input + - `stride`: a list of the stride of the 3 convolutional layers """ function bottleneck(inplanes, outplanes, downsample = false; stride = [1, (downsample ? 2 : 1), 1]) - Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], bias = false)..., - conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, bias = false)..., - conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], bias = false)...) + return Chain(conv_bn((1, 1), inplanes, outplanes[1]; stride = stride[1], + bias = false)..., + conv_bn((3, 3), outplanes[1], outplanes[2]; stride = stride[2], pad = 1, + bias = false)..., + conv_bn((1, 1), outplanes[2], outplanes[3], identity; stride = stride[3], + bias = false)...) end """ @@ -50,13 +57,16 @@ Create a bottleneck residual block layer which has a stride of 2. 
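# Editor's note: an illustrative sketch, not part of this diff. The three-element
# `outplanes` list maps onto the 1x1 -> 3x3 -> 1x1 convolutions of the bottleneck;
# the shapes below assume the unexported helpers keep the signatures shown in this hunk.
using Metalhead
block = Metalhead.bottleneck_v1(64, [64, 64, 256], true)  # stride 2 on the first 1x1 conv
size(block(rand(Float32, 56, 56, 64, 1)))                 # expected to be (28, 28, 256, 1)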
# Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: a list of the number of output feature maps for each convolution - within the residual block -- `downsample`: set to `true` to downsample the input + + - `inplanes`: the number of input feature maps + - `outplanes`: a list of the number of output feature maps for each convolution + within the residual block + - `downsample`: set to `true` to downsample the input """ -bottleneck_v1(inplanes, outplanes, downsample = false) = - bottleneck(inplanes, outplanes, downsample; stride = [(downsample ? 2 : 1), 1, 1]) +function bottleneck_v1(inplanes, outplanes, downsample = false) + return bottleneck(inplanes, outplanes, downsample; + stride = [(downsample ? 2 : 1), 1, 1]) +end """ resnet(block, residuals::NTuple{2, Any}, connection = addrelu; @@ -66,43 +76,48 @@ Create a ResNet model ([reference](https://arxiv.org/abs/1512.03385v1)). # Arguments -- `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns - a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) -- `residuals`: a 2-tuple of functions with input `(inplanes, outplanes, downsample=false)`, - each of which will return a function that will be used as a new "skip" path to match a residual block. - [`Metalhead.skip_identity`](#) and [`Metalhead.skip_projection`](#) can be used here. -- `connection`: the binary function applied to the output of residual and skip paths in a block -- `channel_config`: the growth rate of the output feature maps within a residual block -- `block_config`: a list of the number of residual blocks at each stage -- `nclasses`: the number of output classes + + - `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns + a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) + - `residuals`: a 2-tuple of functions with input `(inplanes, outplanes, downsample=false)`, + each of which will return a function that will be used as a new "skip" path to match a residual block. + [`Metalhead.skip_identity`](#) and [`Metalhead.skip_projection`](#) can be used here. 
+ - `connection`: the binary function applied to the output of residual and skip paths in a block + - `channel_config`: the growth rate of the output feature maps within a residual block + - `block_config`: a list of the number of residual blocks at each stage + - `nclasses`: the number of output classes """ function resnet(block, residuals::AbstractVector{<:NTuple{2, Any}}, connection = addrelu; channel_config, block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 64 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes .* channel_config - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, block(inplanes, outplanes, i != 1), - residuals[i][1](inplanes, outplanes[end], i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes[end] - for _ in 2:nrepeats - push!(layers, Parallel(connection, block(inplanes, outplanes, false), - residuals[i][2](inplanes, outplanes[end], false))) - inplanes = outplanes[end] + inplanes = 64 + baseplanes = 64 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = 3, bias = false)) + push!(layers, MaxPool((3, 3); stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes .* channel_config + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, block(inplanes, outplanes, i != 1), + residuals[i][1](inplanes, outplanes[end], i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes[end] + for _ in 2:nrepeats + push!(layers, + Parallel(connection, block(inplanes, outplanes, false), + residuals[i][2](inplanes, outplanes[end], false))) + inplanes = outplanes[end] + end + # next set of output plane base is doubled + baseplanes *= 2 end # next set of output plane base is doubled baseplanes *= 2 - end - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -113,45 +128,46 @@ Create a ResNet model ([reference](https://arxiv.org/abs/1512.03385v1)). 
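> Editor's note (not part of the patch): a quick sketch of the `bottleneck` block defined above. The channel layout follows the ResNet-50 stage-2 pattern; the 56×56 spatial size is illustrative only.

```julia
using Metalhead

# 1×1 → 3×3 → 1×1 convolutions, each followed by batch norm, as built by `conv_bn`.
block = Metalhead.bottleneck(64, [64, 64, 256])
x = rand(Float32, 56, 56, 64, 1)
size(block(x))  # (56, 56, 256, 1): spatial size preserved when downsample = false
```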
# Arguments -- `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns - a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) -- `shortcut_config`: the type of shortcut style (either `:A`, `:B`, or `:C`) - - `:A`: uses a [`Metalhead.skip_identity`](#) for all residual blocks - - `:B`: uses a [`Metalhead.skip_projection`](#) for the first residual block - and [`Metalhead.skip_identity`](@) for the remaining residual blocks - - `:C`: uses a [`Metalhead.skip_projection`](#) for all residual blocks -- `connection`: the binary function applied to the output of residual and skip paths in a block -- `channel_config`: the growth rate of the output feature maps within a residual block -- `block_config`: a list of the number of residual blocks at each stage -- `nclasses`: the number of output classes + + - `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns + a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) + + - `shortcut_config`: the type of shortcut style (either `:A`, `:B`, or `:C`) + + + `:A`: uses a [`Metalhead.skip_identity`](#) for all residual blocks + + `:B`: uses a [`Metalhead.skip_projection`](#) for the first residual block + and [`Metalhead.skip_identity`](@) for the remaining residual blocks + + `:C`: uses a [`Metalhead.skip_projection`](#) for all residual blocks + - `connection`: the binary function applied to the output of residual and skip paths in a block + - `channel_config`: the growth rate of the output feature maps within a residual block + - `block_config`: a list of the number of residual blocks at each stage + - `nclasses`: the number of output classes """ function resnet(block, shortcut_config::AbstractVector{<:Symbol}, args...; kwargs...) - shortcut_dict = Dict( - :A => (skip_identity, skip_identity), - :B => (skip_projection, skip_identity), - :C => (skip_projection, skip_projection)) - - if any(sc -> !haskey(shortcut_dict,sc),shortcut_config) - error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") - end - - shortcut = [shortcut_dict[sc] for sc in shortcut_config] - resnet(block, shortcut, args...; kwargs...) + shortcut_dict = Dict(:A => (skip_identity, skip_identity), + :B => (skip_projection, skip_identity), + :C => (skip_projection, skip_projection)) + if any(sc -> !haskey(shortcut_dict, sc), shortcut_config) + error("Unrecognized shortcut_config ($shortcut_config) passed to `resnet` (use only :A, :B, or :C).") + end + shortcut = [shortcut_dict[sc] for sc in shortcut_config] + return resnet(block, shortcut, args...; kwargs...) end function resnet(block, shortcut_config::Symbol, args...; block_config, kwargs...) - resnet(block, fill(shortcut_config, length(block_config)), args...; - block_config = block_config, kwargs...) + return resnet(block, fill(shortcut_config, length(block_config)), args...; + block_config = block_config, kwargs...) end -resnet(block, residuals::NTuple{2}, args...; kwargs...) = resnet(block, [residuals], args...; kwargs...) +function resnet(block, residuals::NTuple{2}, args...; kwargs...) + return resnet(block, [residuals], args...; kwargs...) 
+end -const resnet_config = - Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), - 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), - 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), - 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), - 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) +const resnet_config = Dict(18 => (([1, 1], [2, 2, 2, 2], [:A, :B, :B, :B]), basicblock), + 34 => (([1, 1], [3, 4, 6, 3], [:A, :B, :B, :B]), basicblock), + 50 => (([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]), bottleneck), + 101 => (([1, 1, 4], [3, 4, 23, 3], [:B, :B, :B, :B]), bottleneck), + 152 => (([1, 1, 4], [3, 8, 36, 3], [:B, :B, :B, :B]), bottleneck)) """ ResNet(channel_config, block_config, shortcut_config; @@ -162,30 +178,30 @@ Create a `ResNet` model See also [`resnet`](#). # Arguments -- `channel_config`: the growth rate of the output feature maps within a residual block -- `block_config`: a list of the number of residual blocks at each stage -- `shortcut_config`: the type of shortcut style (either `:A`, `:B`, or `:C`). - `shortcut_config` can also be a vector of symbols if different shortcut styles are applied to - different residual blocks. -- `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns - a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) -- `connection`: the binary function applied to the output of residual and skip paths in a block -- `nclasses`: the number of output classes + + - `channel_config`: the growth rate of the output feature maps within a residual block + - `block_config`: a list of the number of residual blocks at each stage + - `shortcut_config`: the type of shortcut style (either `:A`, `:B`, or `:C`). + `shortcut_config` can also be a vector of symbols if different shortcut styles are applied to + different residual blocks. + - `block`: a function with input `(inplanes, outplanes, downsample=false)` that returns + a new residual block (see [`Metalhead.basicblock`](#) and [`Metalhead.bottleneck`](#)) + - `connection`: the binary function applied to the output of residual and skip paths in a block + - `nclasses`: the number of output classes """ struct ResNet - layers + layers::Any end function ResNet(channel_config, block_config, shortcut_config; block, connection = addrelu, nclasses = 1000) - layers = resnet(block, - shortcut_config, - connection; - channel_config = channel_config, - block_config = block_config, - nclasses = nclasses) - - ResNet(layers) + layers = resnet(block, + shortcut_config, + connection; + channel_config = channel_config, + block_config = block_config, + nclasses = nclasses) + return ResNet(layers) end @functor ResNet @@ -206,10 +222,12 @@ referred as ResNet v1.5. See also [`Metalhead.resnet`](#). # Arguments -- `depth`: depth of the ResNet model. Options include (18, 34, 50, 101, 152). -- `nclasses`: the number of output classes + + - `depth`: depth of the ResNet model. Options include (18, 34, 50, 101, 152). + - `nclasses`: the number of output classes !!! warning + Only `ResNet(50)` currently supports pretrained weights. 
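> Editor's note (not part of the patch): per the `resnet_config` table in this hunk, the depth-based constructor is sugar over the lower-level one, for example:

```julia
using Metalhead

# ResNet-50 spelled out via the low-level constructor...
m = ResNet([1, 1, 4], [3, 4, 6, 3], [:B, :B, :B, :B]; block = Metalhead.bottleneck)

# ...which should be structurally identical (up to random initialisation) to:
m50 = ResNet(50)
```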
For `ResNet(18)` and `ResNet(34)`, the parameter-free shortcut style (type `:A`) @@ -242,7 +260,7 @@ function ResNet(depth::Integer = 50; pretrain = false, nclasses = 1000) config, block = resnet_config[depth] model = ResNet(config...; block = block, nclasses = nclasses) pretrain && loadpretrain!(model, string("ResNet", depth)) - model + return model end # Compat with Metalhead 0.6; remove in 0.7 diff --git a/src/convnets/resnext.jl b/src/convnets/resnext.jl index eaa66f98f..b8ed03bb0 100644 --- a/src/convnets/resnext.jl +++ b/src/convnets/resnext.jl @@ -5,19 +5,20 @@ Create a basic residual block as defined in the paper for ResNeXt ([reference](https://arxiv.org/abs/1611.05431)). # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: the number of output feature maps -- `cardinality`: the number of groups to use for the convolution -- `width`: the number of feature maps in each group in the bottleneck -- `downsample`: set to `true` to downsample the input + + - `inplanes`: the number of input feature maps + - `outplanes`: the number of output feature maps + - `cardinality`: the number of groups to use for the convolution + - `width`: the number of feature maps in each group in the bottleneck + - `downsample`: set to `true` to downsample the input """ function resnextblock(inplanes, outplanes, cardinality, width, downsample = false) - stride = downsample ? 2 : 1 - hidden_channels = cardinality * width - return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., - conv_bn((3, 3), hidden_channels, hidden_channels; - stride = stride, pad = 1, bias = false, groups = cardinality)..., - conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) + stride = downsample ? 2 : 1 + hidden_channels = cardinality * width + return Chain(conv_bn((1, 1), inplanes, hidden_channels; stride = 1, bias = false)..., + conv_bn((3, 3), hidden_channels, hidden_channels; + stride = stride, pad = 1, bias = false, groups = cardinality)..., + conv_bn((1, 1), hidden_channels, outplanes; stride = 1, bias = false)...) end """ @@ -28,40 +29,46 @@ Create a ResNeXt model ([reference](https://arxiv.org/abs/1611.05431)). # Arguments -- `cardinality`: the number of groups to use for the convolution -- `width`: the number of feature maps in each group in the bottleneck -- `widen_factor`: the factor by which the width of the bottleneck is increased after each stage -- `connection`: the binary function applied to the output of residual and skip paths in a block -- `block_config`: a list of the number of residual blocks at each stage -- `nclasses`: the number of output classes + + - `cardinality`: the number of groups to use for the convolution + - `width`: the number of feature maps in each group in the bottleneck + - `widen_factor`: the factor by which the width of the bottleneck is increased after each stage + - `connection`: the binary function applied to the output of residual and skip paths in a block + - `block_config`: a list of the number of residual blocks at each stage + - `nclasses`: the number of output classes """ -function resnext(cardinality, width, widen_factor = 2, connection = (x, y) -> @. relu(x) + relu(y); +function resnext(cardinality, width, widen_factor = 2, + connection = (x, y) -> @. 
relu(x) + relu(y); block_config, nclasses = 1000) - inplanes = 64 - baseplanes = 128 - layers = [] - append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) - push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) - for (i, nrepeats) in enumerate(block_config) - # output planes within a block - outplanes = baseplanes * widen_factor - # push first skip connection on using first residual - # downsample the residual path if this is the first repetition of a block - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, i != 1), - skip_projection(inplanes, outplanes, i != 1))) - # push remaining skip connections on using second residual - inplanes = outplanes - for _ in 2:nrepeats - push!(layers, Parallel(connection, resnextblock(inplanes, outplanes, cardinality, width, false), - skip_identity(inplanes, outplanes, false))) + inplanes = 64 + baseplanes = 128 + layers = [] + append!(layers, conv_bn((7, 7), 3, inplanes; stride = 2, pad = (3, 3))) + push!(layers, MaxPool((3, 3); stride = (2, 2), pad = (1, 1))) + for (i, nrepeats) in enumerate(block_config) + # output planes within a block + outplanes = baseplanes * widen_factor + # push first skip connection on using first residual + # downsample the residual path if this is the first repetition of a block + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, i != 1), + skip_projection(inplanes, outplanes, i != 1))) + # push remaining skip connections on using second residual + inplanes = outplanes + for _ in 2:nrepeats + push!(layers, + Parallel(connection, + resnextblock(inplanes, outplanes, cardinality, width, false), + skip_identity(inplanes, outplanes, false))) + end + baseplanes = outplanes + # double width after every cluster of blocks + width *= widen_factor end - baseplanes = outplanes - # double width after every cluster of blocks - width *= widen_factor - end - - return Chain(Chain(layers), - Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, Dense(inplanes, nclasses))) + return Chain(Chain(layers), + Chain(AdaptiveMeanPool((1, 1)), MLUtils.flatten, + Dense(inplanes, nclasses))) end """ @@ -71,18 +78,19 @@ Create a ResNeXt model ([reference](https://arxiv.org/abs/1611.05431)). 
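> Editor's note (not part of the patch): a usage sketch of the ResNeXt constructors defined in this file, using the stage layout from `resnext_config` further down.

```julia
using Metalhead

# ResNeXt-50 (32×4d): 32 groups with 4 feature maps per group.
m = ResNeXt(32, 4; block_config = (3, 4, 6, 3))

# The depth-based convenience constructor builds the same structure:
m50 = ResNeXt(50; cardinality = 32, width = 4)
```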
# Arguments -- `cardinality`: the number of groups to use for the convolution -- `width`: the number of feature maps in each group in the bottleneck -- `block_config`: a list of the number of residual blocks at each stage -- `nclasses`: the number of output classes + + - `cardinality`: the number of groups to use for the convolution + - `width`: the number of feature maps in each group in the bottleneck + - `block_config`: a list of the number of residual blocks at each stage + - `nclasses`: the number of output classes """ struct ResNeXt - layers + layers::Any end function ResNeXt(cardinality, width; block_config, nclasses = 1000) - layers = resnext(cardinality, width; block_config, nclasses) - ResNeXt(layers) + layers = resnext(cardinality, width; block_config, nclasses) + return ResNeXt(layers) end @functor ResNeXt @@ -92,11 +100,9 @@ end backbone(m::ResNeXt) = m.layers[1] classifier(m::ResNeXt) = m.layers[2] -const resnext_config = Dict( - 50 => (3, 4, 6, 3), - 101 => (3, 4, 23, 3), - 152 => (3, 8, 36, 3) -) +const resnext_config = Dict(50 => (3, 4, 6, 3), + 101 => (3, 4, 23, 3), + 152 => (3, 8, 36, 3)) """ ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) @@ -106,14 +112,16 @@ Create a ResNeXt model with specified configuration. Currently supported values Set `pretrain = true` to load the model with pre-trained weights for ImageNet. !!! warning - `ResNeXt` does not currently support pretrained weights. + + +`ResNeXt` does not currently support pretrained weights. See also [`Metalhead.resnext`](#). """ -function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, nclasses = 1000) - @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" - - model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) - pretrain && loadpretrain!(model, string("ResNeXt", config)) - model -end \ No newline at end of file +function ResNeXt(config::Integer = 50; cardinality = 32, width = 4, pretrain = false, + nclasses = 1000) + @assert config in keys(resnext_config) "`config` must be one of $(sort(collect(keys(resnext_config))))" + model = ResNeXt(cardinality, width; block_config = resnext_config[config], nclasses) + pretrain && loadpretrain!(model, string("ResNeXt", config)) + return model +end diff --git a/src/convnets/squeezenet.jl b/src/convnets/squeezenet.jl index 169ad2e86..c4de36acc 100644 --- a/src/convnets/squeezenet.jl +++ b/src/convnets/squeezenet.jl @@ -5,20 +5,21 @@ Create a fire module ([reference](https://arxiv.org/abs/1602.07360v4)). 
# Arguments -- `inplanes`: number of input feature maps -- `squeeze_planes`: number of intermediate feature maps -- `expand1x1_planes`: number of output feature maps for the 1x1 expansion convolution -- `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution + + - `inplanes`: number of input feature maps + - `squeeze_planes`: number of intermediate feature maps + - `expand1x1_planes`: number of output feature maps for the 1x1 expansion convolution + - `expand3x3_planes`: number of output feature maps for the 3x3 expansion convolution """ function fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) - branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) - branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) - branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, pad = 1, relu) + branch_1 = Conv((1, 1), inplanes => squeeze_planes, relu) + branch_2 = Conv((1, 1), squeeze_planes => expand1x1_planes, relu) + branch_3 = Conv((3, 3), squeeze_planes => expand3x3_planes, relu; pad = 1) - return Chain(branch_1, - Parallel(cat_channels, - branch_2, - branch_3)) + return Chain(branch_1, + Parallel(cat_channels, + branch_2, + branch_3)) end """ @@ -28,24 +29,24 @@ Create a SqueezeNet ([reference](https://arxiv.org/abs/1602.07360v4)). """ function squeezenet() - layers = Chain(Chain(Conv((3, 3), 3 => 64, relu, stride = 2), - MaxPool((3, 3), stride = 2), - fire(64, 16, 64, 64), - fire(128, 16, 64, 64), - MaxPool((3, 3), stride = 2), - fire(128, 32, 128, 128), - fire(256, 32, 128, 128), - MaxPool((3, 3), stride = 2), - fire(256, 48, 192, 192), - fire(384, 48, 192, 192), - fire(384, 64, 256, 256), - fire(512, 64, 256, 256), - Dropout(0.5), - Conv((1, 1), 512 => 1000, relu)), - AdaptiveMeanPool((1, 1)), - MLUtils.flatten) + layers = Chain(Chain(Conv((3, 3), 3 => 64, relu; stride = 2), + MaxPool((3, 3); stride = 2), + fire(64, 16, 64, 64), + fire(128, 16, 64, 64), + MaxPool((3, 3); stride = 2), + fire(128, 32, 128, 128), + fire(256, 32, 128, 128), + MaxPool((3, 3); stride = 2), + fire(256, 48, 192, 192), + fire(384, 48, 192, 192), + fire(384, 64, 256, 256), + fire(512, 64, 256, 256), + Dropout(0.5), + Conv((1, 1), 512 => 1000, relu)), + AdaptiveMeanPool((1, 1)), + MLUtils.flatten) - return layers + return layers end """ @@ -56,19 +57,19 @@ Create a SqueezeNet Set `pretrain=true` to load the model with pre-trained weights for ImageNet. !!! warning + `SqueezeNet` does not currently support pretrained weights. See also [`squeezenet`](#). """ struct SqueezeNet - layers + layers::Any end function SqueezeNet(; pretrain = false) - layers = squeezenet() - pretrain && loadpretrain!(layers, "SqueezeNet") - - SqueezeNet(layers) + layers = squeezenet() + pretrain && loadpretrain!(layers, "SqueezeNet") + return SqueezeNet(layers) end @functor SqueezeNet diff --git a/src/convnets/vgg.jl b/src/convnets/vgg.jl index bdca0d9ee..423bcdf53 100644 --- a/src/convnets/vgg.jl +++ b/src/convnets/vgg.jl @@ -5,24 +5,26 @@ A VGG block of convolution layers ([reference](https://arxiv.org/abs/1409.1556v6)). 
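> Editor's note (not part of the patch): a minimal sketch of the `fire` module defined just below; the 32×32 feature map size is illustrative.

```julia
using Metalhead

# Squeeze to 16 channels, then expand through parallel 1×1 and 3×3 branches
# whose outputs are concatenated along the channel dimension.
f = Metalhead.fire(64, 16, 64, 64)
x = rand(Float32, 32, 32, 64, 1)
size(f(x))  # (32, 32, 128, 1): 64 + 64 expansion channels
```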
# Arguments -- `ifilters`: number of input feature maps -- `ofilters`: number of output feature maps -- `depth`: number of convolution/convolution + batch norm layers -- `batchnorm`: set to `true` to include batch normalization after each convolution + + - `ifilters`: number of input feature maps + - `ofilters`: number of output feature maps + - `depth`: number of convolution/convolution + batch norm layers + - `batchnorm`: set to `true` to include batch normalization after each convolution """ function vgg_block(ifilters, ofilters, depth, batchnorm) - k = (3,3) - p = (1,1) - layers = [] - for _ in 1:depth - if batchnorm - append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) - else - push!(layers, Conv(k, ifilters => ofilters, relu, pad = p)) + k = (3, 3) + p = (1, 1) + layers = [] + for _ in 1:depth + if batchnorm + append!(layers, conv_bn(k, ifilters, ofilters; pad = p, bias = false)) + else + push!(layers, Conv(k, ifilters => ofilters, relu; pad = p)) + end + ifilters = ofilters end ifilters = ofilters - end - return layers + return layers end """ @@ -32,20 +34,21 @@ Create VGG convolution layers ([reference](https://arxiv.org/abs/1409.1556v6)). # Arguments -- `config`: vector of tuples `(output_channels, num_convolutions)` - for each block (see [`Metalhead.vgg_block`](#)) -- `batchnorm`: set to `true` to include batch normalization after each convolution -- `inchannels`: number of input channels + + - `config`: vector of tuples `(output_channels, num_convolutions)` + for each block (see [`Metalhead.vgg_block`](#)) + - `batchnorm`: set to `true` to include batch normalization after each convolution + - `inchannels`: number of input channels """ function vgg_convolutional_layers(config, batchnorm, inchannels) - layers = [] - ifilters = inchannels - for c in config - append!(layers, vgg_block(ifilters, c..., batchnorm)) - push!(layers, MaxPool((2,2), stride=2)) - ifilters, _ = c - end - return layers + layers = [] + ifilters = inchannels + for c in config + append!(layers, vgg_block(ifilters, c..., batchnorm)) + push!(layers, MaxPool((2, 2); stride = 2)) + ifilters, _ = c + end + return layers end """ @@ -55,19 +58,20 @@ Create VGG classifier (fully connected) layers ([reference](https://arxiv.org/abs/1409.1556v6)). # Arguments -- `imsize`: tuple `(width, height, channels)` indicating the size after - the convolution layers (see [`Metalhead.vgg_convolutional_layers`](#)) -- `nclasses`: number of output classes -- `fcsize`: input and output size of the intermediate fully connected layer -- `dropout`: the dropout level between each fully connected layer + + - `imsize`: tuple `(width, height, channels)` indicating the size after + the convolution layers (see [`Metalhead.vgg_convolutional_layers`](#)) + - `nclasses`: number of output classes + - `fcsize`: input and output size of the intermediate fully connected layer + - `dropout`: the dropout level between each fully connected layer """ function vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(MLUtils.flatten, - Dense(Int(prod(imsize)), fcsize, relu), - Dropout(dropout), - Dense(fcsize, fcsize, relu), - Dropout(dropout), - Dense(fcsize, nclasses)) + return Chain(MLUtils.flatten, + Dense(Int(prod(imsize)), fcsize, relu), + Dropout(dropout), + Dense(fcsize, fcsize, relu), + Dropout(dropout), + Dense(fcsize, nclasses)) end """ @@ -77,27 +81,28 @@ Create a VGG model ([reference](https://arxiv.org/abs/1409.1556v6)). 
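> Editor's note (not part of the patch): a sketch of `vgg_block`, which returns a plain vector of layers that callers splat into a `Chain`; sizes here are illustrative.

```julia
using Metalhead, Flux

# Two 3×3 convolutions mapping 3 => 64 channels, without batch norm.
layers = Metalhead.vgg_block(3, 64, 2, false)
block = Chain(layers...)
size(block(rand(Float32, 224, 224, 3, 1)))  # (224, 224, 64, 1): padding keeps spatial size
```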
# Arguments -- `imsize`: input image width and height as a tuple -- `config`: the configuration for the convolution layers - (see [`Metalhead.vgg_convolutional_layers`](#)) -- `inchannels`: number of input channels -- `batchnorm`: set to `true` to use batch normalization after each convolution -- `nclasses`: number of output classes -- `fcsize`: intermediate fully connected layer size - (see [`Metalhead.vgg_classifier_layers`](#)) -- `dropout`: dropout level between fully connected layers + + - `imsize`: input image width and height as a tuple + - `config`: the configuration for the convolution layers + (see [`Metalhead.vgg_convolutional_layers`](#)) + - `inchannels`: number of input channels + - `batchnorm`: set to `true` to use batch normalization after each convolution + - `nclasses`: number of output classes + - `fcsize`: intermediate fully connected layer size + (see [`Metalhead.vgg_classifier_layers`](#)) + - `dropout`: dropout level between fully connected layers """ function vgg(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - conv = vgg_convolutional_layers(config, batchnorm, inchannels) - imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] - class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) - return Chain(Chain(conv), class) + conv = vgg_convolutional_layers(config, batchnorm, inchannels) + imsize = outputsize(conv, (imsize..., inchannels); padbatch = true)[1:3] + class = vgg_classifier_layers(imsize, nclasses, fcsize, dropout) + return Chain(Chain(conv), class) end -const vgg_conv_config = Dict(:A => [(64,1), (128,1), (256,2), (512,2), (512,2)], - :B => [(64,2), (128,2), (256,2), (512,2), (512,2)], - :D => [(64,2), (128,2), (256,3), (512,3), (512,3)], - :E => [(64,2), (128,2), (256,4), (512,4), (512,4)]) +const vgg_conv_config = Dict(:A => [(64, 1), (128, 1), (256, 2), (512, 2), (512, 2)], + :B => [(64, 2), (128, 2), (256, 2), (512, 2), (512, 2)], + :D => [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)], + :E => [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]) const vgg_config = Dict(11 => :A, 13 => :B, @@ -105,7 +110,7 @@ const vgg_config = Dict(11 => :A, 19 => :E) struct VGG - layers + layers::Any end """ @@ -114,24 +119,25 @@ end Construct a VGG model with the specified input image size. Typically, the image size is `(224, 224)`. ## Keyword Arguments: -- `config` : VGG convolutional block configuration. It is defined as a vector of tuples `(output_channels, num_convolutions)` for each block -- `inchannels`::Integer : number of input channels -- `batchnorm`::Bool : set to `true` to use batch normalization after each convolution -- `nclasses`::Integer : number of output classes -- `fcsize`: intermediate fully connected layer size - (see [`Metalhead.vgg_classifier_layers`](#)) -- `dropout`: dropout level between fully connected layers + + - `config` : VGG convolutional block configuration. 
It is defined as a vector of tuples `(output_channels, num_convolutions)` for each block + - `inchannels`::Integer : number of input channels + - `batchnorm`::Bool : set to `true` to use batch normalization after each convolution + - `nclasses`::Integer : number of output classes + - `fcsize`: intermediate fully connected layer size + (see [`Metalhead.vgg_classifier_layers`](#)) + - `dropout`: dropout level between fully connected layers """ function VGG(imsize::Dims{2}; config, inchannels, batchnorm = false, nclasses, fcsize, dropout) - layers = vgg(imsize; config = config, - inchannels = inchannels, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = fcsize, - dropout = dropout) - - VGG(layers) + layers = vgg(imsize; config = config, + inchannels = inchannels, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = fcsize, + dropout = dropout) + + return VGG(layers) end @functor VGG @@ -149,27 +155,27 @@ Create a VGG style model with specified `depth`. Available values include (11, 1 See also [`VGG`](#). !!! warning + `VGG` does not currently support pretrained weights. # Arguments -- `pretrain`: set to `true` to load pre-trained model weights for ImageNet + + - `pretrain`: set to `true` to load pre-trained model weights for ImageNet """ function VGG(depth::Integer = 16; pretrain = false, batchnorm = false, nclasses = 1000) - @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" - - model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], - inchannels = 3, - batchnorm = batchnorm, - nclasses = nclasses, - fcsize = 4096, - dropout = 0.5) - - if pretrain && !batchnorm - loadpretrain!(model, string("VGG", depth)) - elseif pretrain - loadpretrain!(model, "VGG$(depth)-BN)") - end - model + @assert depth in keys(vgg_config) "depth must be from one in $(sort(collect(keys(vgg_config))))" + model = VGG((224, 224); config = vgg_conv_config[vgg_config[depth]], + inchannels = 3, + batchnorm = batchnorm, + nclasses = nclasses, + fcsize = 4096, + dropout = 0.5) + if pretrain && !batchnorm + loadpretrain!(model, string("VGG", depth)) + elseif pretrain + loadpretrain!(model, "VGG$(depth)-BN)") + end + return model end # deprecations diff --git a/src/layers/attention.jl b/src/layers/attention.jl index 10baf73e9..3d63ddad0 100644 --- a/src/layers/attention.jl +++ b/src/layers/attention.jl @@ -4,16 +4,17 @@ Multi-head self-attention layer. # Arguments: -- `nheads`: Number of heads -- `qkv_layer`: layer to be used for getting the query, key and value -- `attn_drop`: dropout rate after the self-attention layer -- `projection`: projection layer to be used after self-attention + + - `nheads`: Number of heads + - `qkv_layer`: layer to be used for getting the query, key and value + - `attn_drop`: dropout rate after the self-attention layer + - `projection`: projection layer to be used after self-attention """ struct MHAttention{P, Q, R} - nheads::Int - qkv_layer::P - attn_drop::Q - projection::R + nheads::Int + qkv_layer::P + attn_drop::Q + projection::R end """ @@ -22,37 +23,38 @@ end Multi-head self-attention layer. # Arguments: -- `planes`: number of input channels -- `nheads`: number of heads -- `qkv_bias`: whether to use bias in the layer to get the query, key and value -- `attn_drop`: dropout rate after the self-attention layer -- `proj_drop`: dropout rate after the projection layer -""" -function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, attn_drop = 0., proj_drop = 0.) 
- @assert planes % nheads == 0 "planes should be divisible by nheads" - qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) - attn_drop = Dropout(attn_drop) - proj = Chain(Dense(planes, planes), Dropout(proj_drop)) - MHAttention(nheads, qkv_layer, attn_drop, proj) + - `planes`: number of input channels + - `nheads`: number of heads + - `qkv_bias`: whether to use bias in the layer to get the query, key and value + - `attn_drop`: dropout rate after the self-attention layer + - `proj_drop`: dropout rate after the projection layer +""" +function MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false, + attn_drop = 0.0, proj_drop = 0.0) + @assert planes % nheads==0 "planes should be divisible by nheads" + qkv_layer = Dense(planes, planes * 3; bias = qkv_bias) + attn_drop = Dropout(attn_drop) + proj = Chain(Dense(planes, planes), Dropout(proj_drop)) + return MHAttention(nheads, qkv_layer, attn_drop, proj) end @functor MHAttention function (m::MHAttention)(x::AbstractArray{T, 3}) where {T} - nfeatures, seq_len, batch_size = size(x) - x_reshaped = reshape(x, nfeatures, seq_len * batch_size) - qkv = m.qkv_layer(x_reshaped) - qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) - query, key, value = chunk(qkv_reshaped, 3; dims = 4) - scale = convert(T, sqrt(size(query, 1) / m.nheads)) - key_reshaped = reshape( - permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, seq_len * batch_size - ) - query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) - value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) - pre_projection = reshape(batched_mul(attention, value_reshaped), (nfeatures, seq_len, batch_size)) - y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) - return reshape(y, :, seq_len, batch_size) + nfeatures, seq_len, batch_size = size(x) + x_reshaped = reshape(x, nfeatures, seq_len * batch_size) + qkv = m.qkv_layer(x_reshaped) + qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size) + query, key, value = chunk(qkv_reshaped, 3; dims = 4) + scale = convert(T, sqrt(size(query, 1) / m.nheads)) + key_reshaped = reshape(permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads, + seq_len * batch_size) + query_reshaped = reshape(query, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + attention = m.attn_drop(softmax(batched_mul(query_reshaped, key_reshaped) .* scale)) + value_reshaped = reshape(value, nfeatures ÷ m.nheads, m.nheads, seq_len * batch_size) + pre_projection = reshape(batched_mul(attention, value_reshaped), + (nfeatures, seq_len, batch_size)) + y = m.projection(reshape(pre_projection, size(pre_projection, 1), :)) + return reshape(y, :, seq_len, batch_size) end diff --git a/src/layers/conv.jl b/src/layers/conv.jl index ca30df8a4..d9b631bb0 100644 --- a/src/layers/conv.jl +++ b/src/layers/conv.jl @@ -7,45 +7,44 @@ Create a convolution + batch normalization pair with activation. 
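> Editor's note (not part of the patch): `conv_bn` returns a vector `[Conv, BatchNorm]` (reversed when `rev = true`), which is why call sites throughout this diff splat it into a `Chain`. A sketch mirroring the ResNet stem:

```julia
using Metalhead, Flux

stem = Chain(Metalhead.conv_bn((7, 7), 3, 64; stride = 2, pad = 3, bias = false)...)
size(stem(rand(Float32, 224, 224, 3, 1)))  # (112, 112, 64, 1)
```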
# Arguments -- `kernelsize`: size of the convolution kernel (tuple) -- `inplanes`: number of input feature maps -- `outplanes`: number of output feature maps -- `activation`: the activation function for the final layer -- `rev`: set to `true` to place the batch norm before the convolution -- `preact`: set to `true` to place the activation function before the batch norm - (only compatible with `rev = false`) -- `stride`: stride of the convolution kernel -- `pad`: padding of the convolution kernel -- `dilation`: dilation of the convolution kernel -- `groups`: groups for the convolution kernel -- `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) -- `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) -- `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) + + - `kernelsize`: size of the convolution kernel (tuple) + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps + - `activation`: the activation function for the final layer + - `rev`: set to `true` to place the batch norm before the convolution + - `preact`: set to `true` to place the activation function before the batch norm + (only compatible with `rev = false`) + - `stride`: stride of the convolution kernel + - `pad`: padding of the convolution kernel + - `dilation`: dilation of the convolution kernel + - `groups`: groups for the convolution kernel + - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) + - `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) + - `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) """ function conv_bn(kernelsize, inplanes, outplanes, activation = relu; rev = false, preact = false, - initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1f-5, momentum = 1f-1, + initβ = Flux.zeros32, initγ = Flux.ones32, ϵ = 1.0f-5, momentum = 1.0f-1, kwargs...) - layers = [] - - if rev - activations = (conv = activation, bn = identity) - bnplanes = inplanes - else - activations = (conv = identity, bn = activation) - bnplanes = outplanes - end - - if preact - rev ? throw(ArgumentError("preact and rev cannot be set at the same time")) : - activations = (conv = activation, bn = identity) - end - - push!(layers, Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) - push!(layers, BatchNorm(Int(bnplanes), activations.bn; - initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) - - return rev ? reverse(layers) : layers + layers = [] + if rev + activations = (conv = activation, bn = identity) + bnplanes = inplanes + else + activations = (conv = identity, bn = activation) + bnplanes = outplanes + end + if preact + rev ? throw(ArgumentError("preact and rev cannot be set at the same time")) : + activations = (conv = activation, bn = identity) + end + push!(layers, + Conv(kernelsize, Int(inplanes) => Int(outplanes), activations.conv; kwargs...)) + push!(layers, + BatchNorm(Int(bnplanes), activations.bn; + initβ = initβ, initγ = initγ, ϵ = ϵ, momentum = momentum)) + return rev ? reverse(layers) : layers end """ @@ -57,38 +56,41 @@ end Create a depthwise separable convolution chain as used in MobileNet v1. 
This is sequence of layers: -- a `kernelsize` depthwise convolution from `inplanes => inplanes` -- a batch norm layer + `activation` -- a `kernelsize` convolution from `inplanes => outplanes` -- a batch norm layer + `activation` + + - a `kernelsize` depthwise convolution from `inplanes => inplanes` + - a batch norm layer + `activation` + - a `kernelsize` convolution from `inplanes => outplanes` + - a batch norm layer + `activation` See Fig. 3 in [reference](https://arxiv.org/abs/1704.04861v1). # Arguments -- `kernelsize`: size of the convolution kernel (tuple) -- `inplanes`: number of input feature maps -- `outplanes`: number of output feature maps -- `activation`: the activation function for the final layer -- `rev`: set to `true` to place the batch norm before the convolution -- `stride`: stride of the first convolution kernel -- `pad`: padding of the first convolution kernel -- `dilation`: dilation of the first convolution kernel -- `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) -- `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) -- `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) + + - `kernelsize`: size of the convolution kernel (tuple) + - `inplanes`: number of input feature maps + - `outplanes`: number of output feature maps + - `activation`: the activation function for the final layer + - `rev`: set to `true` to place the batch norm before the convolution + - `stride`: stride of the first convolution kernel + - `pad`: padding of the first convolution kernel + - `dilation`: dilation of the first convolution kernel + - `bias`, `weight`, `init`: initialization for the convolution kernel (see [`Flux.Conv`](#)) + - `initβ`, `initγ`: initialization for the batch norm (see [`Flux.BatchNorm`](#)) + - `ϵ`, `momentum`: batch norm parameters (see [`Flux.BatchNorm`](#)) """ -depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; - rev = false, - initβ = Flux.zeros32, initγ = Flux.ones32, - ϵ = 1f-5, momentum = 1f-1, - stride = 1, kwargs...) = - vcat(conv_bn(kernelsize, inplanes, inplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum, - stride = stride, groups = Int(inplanes), kwargs...), - conv_bn((1, 1), inplanes, outplanes, activation; - rev = rev, initβ = initβ, initγ = initγ, - ϵ = ϵ, momentum = momentum)) +function depthwise_sep_conv_bn(kernelsize, inplanes, outplanes, activation = relu; + rev = false, + initβ = Flux.zeros32, initγ = Flux.ones32, + ϵ = 1.0f-5, momentum = 1.0f-1, + stride = 1, kwargs...) + return vcat(conv_bn(kernelsize, inplanes, inplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum, + stride = stride, groups = Int(inplanes), kwargs...), + conv_bn((1, 1), inplanes, outplanes, activation; + rev = rev, initβ = initβ, initγ = initγ, + ϵ = ϵ, momentum = momentum)) +end """ skip_projection(inplanes, outplanes, downsample = false) @@ -97,13 +99,16 @@ Create a skip projection ([reference](https://arxiv.org/abs/1512.03385v1)). # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: the number of output feature maps -- `downsample`: set to `true` to downsample the input + + - `inplanes`: the number of input feature maps + - `outplanes`: the number of output feature maps + - `downsample`: set to `true` to downsample the input """ -skip_projection(inplanes, outplanes, downsample = false) = downsample ? 
- Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : - Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +function skip_projection(inplanes, outplanes, downsample = false) + return downsample ? + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 2, bias = false)) : + Chain(conv_bn((1, 1), inplanes, outplanes, identity; stride = 1, bias = false)) +end # array -> PaddedView(0, array, outplanes) for zero padding arrays """ @@ -113,20 +118,22 @@ Create a identity projection ([reference](https://arxiv.org/abs/1512.03385v1)). # Arguments: -- `inplanes`: the number of input feature maps -- `outplanes`: the number of output feature maps -- `downsample`: this argument is ignored but it is needed for compatibility with [`resnet`](#). + + - `inplanes`: the number of input feature maps + - `outplanes`: the number of output feature maps + - `downsample`: this argument is ignored but it is needed for compatibility with [`resnet`](#). """ function skip_identity(inplanes, outplanes) - if outplanes > inplanes - return Chain(MaxPool((1, 1), stride = 2), - y -> cat(y, zeros(eltype(y), - size(y, 1), - size(y, 2), - outplanes - inplanes, size(y, 4)); dims = 3)) - else - return identity - end + if outplanes > inplanes + return Chain(MaxPool((1, 1); stride = 2), + y -> cat(y, + zeros(eltype(y), + size(y, 1), + size(y, 2), + outplanes - inplanes, size(y, 4)); dims = 3)) + else + return identity + end end skip_identity(inplanes, outplanes, downsample) = skip_identity(inplanes, outplanes) @@ -137,15 +144,18 @@ Squeeze and excitation layer used by MobileNet variants ([reference](https://arxiv.org/abs/1905.02244)). # Arguments -- `channels`: the number of input/output feature maps -- `reduction = 4`: the reduction factor for the number of hidden feature maps - (must be >= 1) + + - `channels`: the number of input/output feature maps + - `reduction = 4`: the reduction factor for the number of hidden feature maps + (must be >= 1) """ function squeeze_excite(channels, reduction = 4) - @assert (reduction >= 1) "`reduction` must be >= 1" - SkipConnection(Chain(AdaptiveMeanPool((1, 1)), - conv_bn((1, 1), channels, channels ÷ reduction, relu; bias = false)..., - conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), .*) + @assert (reduction>=1) "`reduction` must be >= 1" + return SkipConnection(Chain(AdaptiveMeanPool((1, 1)), + conv_bn((1, 1), channels, channels ÷ reduction, relu; + bias = false)..., + conv_bn((1, 1), channels ÷ reduction, channels, hardσ)...), + .*) end """ @@ -156,31 +166,32 @@ Create a basic inverted residual block for MobileNet variants ([reference](https://arxiv.org/abs/1905.02244)). # Arguments -- `kernel_size`: The kernel size of the convolutional layers -- `inplanes`: The number of input feature maps -- `hidden_planes`: The number of feature maps in the hidden layer -- `outplanes`: The number of output feature maps -- `activation`: The activation function for the first two convolution layer -- `stride`: The stride of the convolutional kernel, has to be either 1 or 2 -- `reduction`: The reduction factor for the number of hidden feature maps - in a squeeze and excite layer (see [`squeeze_excite`](#)). - Must be >= 1 or `nothing` for no squeeze and excite layer. -""" -function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, activation = relu; - stride, reduction = nothing) - @assert stride in [1, 2] "`stride` has to be 1 or 2" - pad = @. (kernel_size - 1) ÷ 2 - conv1 = (inplanes == hidden_planes) ? 
identity : Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) - selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) - - invres = Chain(conv1, - conv_bn(kernel_size, hidden_planes, hidden_planes, activation; - bias = false, stride, pad = pad, groups = hidden_planes)..., - selayer, - conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) + - `kernel_size`: The kernel size of the convolutional layers + - `inplanes`: The number of input feature maps + - `hidden_planes`: The number of feature maps in the hidden layer + - `outplanes`: The number of output feature maps + - `activation`: The activation function for the first two convolution layer + - `stride`: The stride of the convolutional kernel, has to be either 1 or 2 + - `reduction`: The reduction factor for the number of hidden feature maps + in a squeeze and excite layer (see [`squeeze_excite`](#)). + Must be >= 1 or `nothing` for no squeeze and excite layer. +""" +function invertedresidual(kernel_size, inplanes, hidden_planes, outplanes, + activation = relu; stride, reduction = nothing) + @assert stride in [1, 2] "`stride` has to be 1 or 2" + pad = @. (kernel_size - 1) ÷ 2 + conv1 = (inplanes == hidden_planes) ? identity : + Chain(conv_bn((1, 1), inplanes, hidden_planes, activation; bias = false)) + selayer = isnothing(reduction) ? identity : squeeze_excite(hidden_planes, reduction) + invres = Chain(conv1, + conv_bn(kernel_size, hidden_planes, hidden_planes, activation; + bias = false, stride, pad = pad, groups = hidden_planes)..., + selayer, + conv_bn((1, 1), hidden_planes, outplanes, identity; bias = false)...) + return (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres +end - (stride == 1 && inplanes == outplanes) ? SkipConnection(invres, +) : invres +function invertedresidual(kernel_size::Integer, args...; kwargs...) + return invertedresidual((kernel_size, kernel_size), args...; kwargs...) end -invertedresidual(kernel_size::Integer, args...; kwargs...) = - invertedresidual((kernel_size, kernel_size), args...; kwargs...) diff --git a/src/layers/embeddings.jl b/src/layers/embeddings.jl index 06116bdc2..7c0d4f7e6 100644 --- a/src/layers/embeddings.jl +++ b/src/layers/embeddings.jl @@ -5,31 +5,31 @@ _flatten_spatial(x) = permutedims(reshape(x, (:, size(x, 3), size(x, 4))), (2, 1 patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) -Patch embedding layer used by many vision transformer-like models to split the input image into +Patch embedding layer used by many vision transformer-like models to split the input image into patches. 
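> Editor's note (not part of the patch): a sketch of `PatchEmbedding` with its documented defaults; the output shape follows from the stride-`patch_size` convolution plus `_flatten_spatial`.

```julia
using Metalhead

# Split a 224×224 RGB image into 16×16 patches, embedding each into 768 channels.
pe = Metalhead.PatchEmbedding((224, 224); inchannels = 3, patch_size = (16, 16),
                              embedplanes = 768)
size(pe(rand(Float32, 224, 224, 3, 1)))  # (768, 196, 1), where 196 = (224 ÷ 16)^2
```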
# Arguments: -- `imsize`: the size of the input image -- `inchannels`: the number of channels in the input image -- `patch_size`: the size of the patches -- `embedplanes`: the number of channels in the embedding -- `norm_layer`: the normalization layer - by default the identity function but otherwise takes a - single argument constructor for a normalization layer like LayerNorm or BatchNorm -- `flatten`: set true to flatten the input spatial dimensions after the embedding + + - `imsize`: the size of the input image + - `inchannels`: the number of channels in the input image + - `patch_size`: the size of the patches + - `embedplanes`: the number of channels in the embedding + - `norm_layer`: the normalization layer - by default the identity function but otherwise takes a + single argument constructor for a normalization layer like LayerNorm or BatchNorm + - `flatten`: set true to flatten the input spatial dimensions after the embedding """ function PatchEmbedding(imsize::Dims{2} = (224, 224); inchannels::Integer = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, norm_layer = planes -> identity, flatten = true) + im_height, im_width = imsize + patch_height, patch_width = patch_size - im_height, im_width = imsize - patch_height, patch_width = patch_size - - @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) - "Image dimensions must be divisible by the patch size." + @assert (im_height % patch_height == 0) && (im_width % patch_width == 0) + "Image dimensions must be divisible by the patch size." - return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), - flatten ? _flatten_spatial : identity, - norm_layer(embedplanes)) + return Chain(Conv(patch_size, inchannels => embedplanes; stride = patch_size), + flatten ? _flatten_spatial : identity, + norm_layer(embedplanes)) end """ @@ -38,11 +38,13 @@ end Positional embedding layer used by many vision transformer-like models. """ struct ViPosEmbedding{T} - vectors::T + vectors::T end -ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> rand(Float32, dims)) = - ViPosEmbedding(init((embedsize, npatches))) +function ViPosEmbedding(embedsize::Integer, npatches::Integer; + init = (dims::Dims{2}) -> rand(Float32, dims)) + return ViPosEmbedding(init((embedsize, npatches))) +end (p::ViPosEmbedding)(x) = x .+ p.vectors @@ -54,7 +56,7 @@ ViPosEmbedding(embedsize::Integer, npatches::Integer; init = (dims::Dims{2}) -> Appends class tokens to an input with embedding dimension `dim` for use in many vision transformer models. """ struct ClassTokens{T} - token::T + token::T end ClassTokens(dim::Integer; init = Flux.zeros32) = ClassTokens(init(dim, 1, 1)) diff --git a/src/layers/mlp.jl b/src/layers/mlp.jl index ca8f38f97..25ead874b 100644 --- a/src/layers/mlp.jl +++ b/src/layers/mlp.jl @@ -5,16 +5,17 @@ Feedforward block used in many MLPMixer-like and vision-transformer models. # Arguments -- `inplanes`: Number of dimensions in the input. -- `hidden_planes`: Number of dimensions in the intermediate layer. -- `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. -- `dropout`: Dropout rate. -- `activation`: Activation function to use. + + - `inplanes`: Number of dimensions in the input. + - `hidden_planes`: Number of dimensions in the intermediate layer. + - `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. + - `dropout`: Dropout rate. + - `activation`: Activation function to use. 
""" -function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; - dropout = 0., activation = gelu) - Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), - Dense(hidden_planes, outplanes), Dropout(dropout)) +function mlp_block(inplanes::Integer, hidden_planes::Integer, outplanes::Integer = inplanes; + dropout = 0.0, activation = gelu) + return Chain(Dense(inplanes, hidden_planes, activation), Dropout(dropout), + Dense(hidden_planes, outplanes), Dropout(dropout)) end """ @@ -25,20 +26,21 @@ Feedforward block based on the implementation in the paper "Pay Attention to MLP ([reference](https://arxiv.org/abs/2105.08050)) # Arguments -- `gate_layer`: Layer to use for the gating. -- `inplanes`: Number of dimensions in the input. -- `hidden_planes`: Number of dimensions in the intermediate layer. -- `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. -- `dropout`: Dropout rate. -- `activation`: Activation function to use. + + - `gate_layer`: Layer to use for the gating. + - `inplanes`: Number of dimensions in the input. + - `hidden_planes`: Number of dimensions in the intermediate layer. + - `outplanes`: Number of dimensions in the output - by default it is the same as `inplanes`. + - `dropout`: Dropout rate. + - `activation`: Activation function to use. """ function gated_mlp_block(gate_layer, inplanes::Integer, hidden_planes::Integer, - outplanes::Integer = inplanes; dropout = 0., activation = gelu) - @assert hidden_planes % 2 == 0 "`hidden_planes` must be even for gated MLP" - return Chain(Dense(inplanes, hidden_planes, activation), - Dropout(dropout), - gate_layer(hidden_planes), - Dense(hidden_planes ÷ 2, outplanes), - Dropout(dropout)) + outplanes::Integer = inplanes; dropout = 0.0, activation = gelu) + @assert hidden_planes % 2==0 "`hidden_planes` must be even for gated MLP" + return Chain(Dense(inplanes, hidden_planes, activation), + Dropout(dropout), + gate_layer(hidden_planes), + Dense(hidden_planes ÷ 2, outplanes), + Dropout(dropout)) end gated_mlp_block(::typeof(identity), args...; kwargs...) = mlp_block(args...; kwargs...) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index a7bce3e6c..4f69dab03 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -5,23 +5,23 @@ prenorm(planes, fn) = Chain(LayerNorm(planes), fn) ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1f-5) A variant of LayerNorm where the input is normalised along the -channel dimension. The input is expected to have channel dimension with size +channel dimension. The input is expected to have channel dimension with size `sz`. It also applies a learnable shift and rescaling after the normalization. Note that this is specifically for inputs with 4 dimensions in the format (H, W, C, N) where H, W are the height and width of the input, C is the number of channels, and N is the batch size. 
""" -struct ChannelLayerNorm{D,T} - diag::D - ϵ::T +struct ChannelLayerNorm{D, T} + diag::D + ϵ::T end @functor ChannelLayerNorm -(m::ChannelLayerNorm)(x) = m.diag(MLUtils.normalise(x, dims = ndims(x) - 1, ϵ = m.ϵ)) +(m::ChannelLayerNorm)(x) = m.diag(MLUtils.normalise(x; dims = ndims(x) - 1, ϵ = m.ϵ)) -function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1f-5) - diag = Flux.Scale(1, 1, sz, λ) - return ChannelLayerNorm(diag, ϵ) +function ChannelLayerNorm(sz::Integer, λ = identity; ϵ = 1.0f-5) + diag = Flux.Scale(1, 1, sz, λ) + return ChannelLayerNorm(diag, ϵ) end diff --git a/src/layers/others.jl b/src/layers/others.jl index 366b273e4..770bccebd 100644 --- a/src/layers/others.jl +++ b/src/layers/others.jl @@ -5,11 +5,13 @@ Creates a `Flux.Scale` layer that performs "`LayerScale`" ([reference](https://arxiv.org/abs/2103.17239)). # Arguments -- `planes`: Size of channel dimension in the input. -- `λ`: initialisation value for the learnable diagonal matrix. + + - `planes`: Size of channel dimension in the input. + - `λ`: initialisation value for the learnable diagonal matrix. """ -LayerScale(planes::Integer, λ) = - λ > 0 ? Flux.Scale(fill(Float32(λ), planes), false) : identity +function LayerScale(planes::Integer, λ) + return λ > 0 ? Flux.Scale(fill(Float32(λ), planes), false) : identity +end """ DropPath(p) @@ -18,6 +20,7 @@ Implements Stochastic Depth - equivalent to `Dropout(p; dims = 4)` when `p` ≥ ([reference](https://arxiv.org/abs/1603.09382)) # Arguments -- `p`: rate of Stochastic Depth. + + - `p`: rate of Stochastic Depth. """ -DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity \ No newline at end of file +DropPath(p) = p ≥ 0 ? Dropout(p; dims = 4) : identity diff --git a/src/other/mlpmixer.jl b/src/other/mlpmixer.jl index 880486dc2..942abc823 100644 --- a/src/other/mlpmixer.jl +++ b/src/other/mlpmixer.jl @@ -6,26 +6,27 @@ Creates a feedforward block for the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)) # Arguments: -- `planes`: the number of planes in the block -- `npatches`: the number of patches of the input -- `mlp_ratio`: number(s) that determine(s) the number of hidden channels in the token mixing MLP - and/or the channel mixing MLP as a ratio to the number of planes in the block. -- `mlp_layer`: the MLP layer to use in the block -- `dropout`: the dropout rate to use in the MLP blocks -- `drop_path_rate`: Stochastic depth rate -- `activation`: the activation function to use in the MLP blocks + + - `planes`: the number of planes in the block + - `npatches`: the number of patches of the input + - `mlp_ratio`: number(s) that determine(s) the number of hidden channels in the token mixing MLP + and/or the channel mixing MLP as a ratio to the number of planes in the block. 
+ - `mlp_layer`: the MLP layer to use in the block + - `dropout`: the dropout rate to use in the MLP blocks + - `drop_path_rate`: Stochastic depth rate + - `activation`: the activation function to use in the MLP blocks """ -function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu) - tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] - return Chain(SkipConnection(Chain(LayerNorm(planes), - swapdims((2, 1, 3)), - mlp_layer(npatches, tokenplanes; activation, dropout), - swapdims((2, 1, 3)), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(LayerNorm(planes), - mlp_layer(planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +)) +function mixerblock(planes, npatches; mlp_ratio = (0.5, 4.0), mlp_layer = mlp_block, + dropout = 0.0, drop_path_rate = 0.0, activation = gelu) + tokenplanes, channelplanes = [Int(r * planes) for r in mlp_ratio] + return Chain(SkipConnection(Chain(LayerNorm(planes), + swapdims((2, 1, 3)), + mlp_layer(npatches, tokenplanes; activation, dropout), + swapdims((2, 1, 3)), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(LayerNorm(planes), + mlp_layer(planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +)) end """ @@ -37,40 +38,44 @@ Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). # Arguments -- `block`: the type of mixer block to use in the model - architecture dependent - (a constructor of the form `block(embedplanes, npatches; drop_path_rate, kwargs...)`) -- `imsize`: the size of the input image -- `inchannels`: the number of input channels -- `norm_layer`: the normalization layer to use in the model -- `patch_size`: the size of the patches -- `embedplanes`: the number of channels after the patch embedding (denotes the hidden dimension) -- `drop_path_rate`: Stochastic depth rate -- `depth`: the number of blocks in the model -- `nclasses`: number of output classes -- `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if - not specified. + + - `block`: the type of mixer block to use in the model - architecture dependent + (a constructor of the form `block(embedplanes, npatches; drop_path_rate, kwargs...)`) + - `imsize`: the size of the input image + - `inchannels`: the number of input channels + - `norm_layer`: the normalization layer to use in the model + - `patch_size`: the size of the patches + - `embedplanes`: the number of channels after the patch embedding (denotes the hidden dimension) + - `drop_path_rate`: Stochastic depth rate + - `depth`: the number of blocks in the model + - `nclasses`: number of output classes + - `kwargs`: additional arguments (if any) to pass to the mixer block. Will use the defaults if + not specified. """ -function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, norm_layer = LayerNorm, - patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0., +function mlpmixer(block, imsize::Dims{2} = (224, 224); inchannels = 3, + norm_layer = LayerNorm, + patch_size::Dims{2} = (16, 16), embedplanes = 512, drop_path_rate = 0.0, depth = 12, nclasses = 1000, kwargs...) - npatches = prod(imsize .÷ patch_size) - dp_rates = LinRange{Float32}(0., drop_path_rate, depth) - layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], kwargs...) 
- for i in 1:depth])) - - classification_head = Chain(norm_layer(embedplanes), seconddimmean, Dense(embedplanes, nclasses)) - return Chain(layers, classification_head) + npatches = prod(imsize .÷ patch_size) + dp_rates = LinRange{Float32}(0.0, drop_path_rate, depth) + layers = Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + Chain([block(embedplanes, npatches; drop_path_rate = dp_rates[i], + kwargs...) + for i in 1:depth])) + + classification_head = Chain(norm_layer(embedplanes), seconddimmean, + Dense(embedplanes, nclasses)) + return Chain(layers, classification_head) end # Configurations for MLPMixer models -mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), - :base => Dict(:depth => 12, :planes => 768), +mixer_configs = Dict(:small => Dict(:depth => 8, :planes => 512), + :base => Dict(:depth => 12, :planes => 768), :large => Dict(:depth => 24, :planes => 1024), - :huge => Dict(:depth => 32, :planes => 1280)) + :huge => Dict(:depth => 32, :planes => 1280)) struct MLPMixer - layers + layers::Any end """ @@ -81,21 +86,23 @@ Creates a model with the MLPMixer architecture. ([reference](https://arxiv.org/pdf/2105.01601)). # Arguments -- `size`: the size of the model - one of `small`, `base`, `large` or `huge` -- `patch_size`: the size of the patches -- `imsize`: the size of the input image -- `drop_path_rate`: Stochastic depth rate -- `nclasses`: number of output classes + + - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `patch_size`: the size of the patches + - `imsize`: the size of the input image + - `drop_path_rate`: Stochastic depth rate + - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). """ function MLPMixer(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, nclasses) - MLPMixer(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(mixerblock, imsize; patch_size, embedplanes, depth, drop_path_rate, + nclasses) + return MLPMixer(layers) end @functor MLPMixer @@ -113,32 +120,34 @@ Creates a block for the ResMixer architecture. ([reference](https://arxiv.org/abs/2105.03404)). 
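A quick end-to-end sketch of the user-facing `MLPMixer` constructor whose formatting changed above; the expected output size follows the package's own tests, and the configuration comment reflects the `mixer_configs` table:

```julia
using Metalhead

m = MLPMixer(:small; patch_size = (16, 16), imsize = (224, 224))  # 8 blocks, 512 planes
x = rand(Float32, 224, 224, 3, 1)
size(m(x))   # (1000, 1) with the default nclasses = 1000
```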
# Arguments -- `planes`: the number of planes in the block -- `npatches`: the number of patches of the input -- `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number - of planes in the block -- `mlp_layer`: the MLP block to use -- `dropout`: the dropout rate to use in the MLP blocks -- `drop_path_rate`: Stochastic depth rate -- `activation`: the activation function to use in the MLP blocks -- `λ`: initialisation constant for the LayerScale + + - `planes`: the number of planes in the block + - `npatches`: the number of patches of the input + - `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number + of planes in the block + - `mlp_layer`: the MLP block to use + - `dropout`: the dropout rate to use in the MLP blocks + - `drop_path_rate`: Stochastic depth rate + - `activation`: the activation function to use in the MLP blocks + - `λ`: initialisation constant for the LayerScale """ function resmixerblock(planes, npatches; mlp_ratio = 4.0, mlp_layer = mlp_block, - dropout = 0., drop_path_rate = 0., activation = gelu, λ = 1e-4) -return Chain(SkipConnection(Chain(Flux.Scale(planes), - swapdims((2, 1, 3)), - Dense(npatches, npatches), - swapdims((2, 1, 3)), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +), - SkipConnection(Chain(Flux.Scale(planes), - mlp_layer(planes, Int(mlp_ratio * planes); dropout, activation), - LayerScale(planes, λ), - DropPath(drop_path_rate)), +)) + dropout = 0.0, drop_path_rate = 0.0, activation = gelu, λ = 1e-4) + return Chain(SkipConnection(Chain(Flux.Scale(planes), + swapdims((2, 1, 3)), + Dense(npatches, npatches), + swapdims((2, 1, 3)), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +), + SkipConnection(Chain(Flux.Scale(planes), + mlp_layer(planes, Int(mlp_ratio * planes); dropout, + activation), + LayerScale(planes, λ), + DropPath(drop_path_rate)), +)) end struct ResMLP - layers + layers::Any end """ @@ -149,22 +158,23 @@ Creates a model with the ResMLP architecture. ([reference](https://arxiv.org/abs/2105.03404)). # Arguments -- `size`: the size of the model - one of `small`, `base`, `large` or `huge` -- `patch_size`: the size of the patches -- `imsize`: the size of the input image -- `drop_path_rate`: Stochastic depth rate -- `nclasses`: number of output classes + + - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `patch_size`: the size of the patches + - `imsize`: the size of the input image + - `drop_path_rate`: Stochastic depth rate + - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). 
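`resmixerblock` above composes the `LayerScale` and `DropPath` helpers from `src/layers/others.jl`, both of which collapse to `identity` for non-positive arguments. A rough illustration using the non-exported helpers (shapes are illustrative, not prescribed by the patch):

```julia
using Metalhead, Flux

Metalhead.LayerScale(64, 0) === identity   # λ must be > 0 to get a learnable Flux.Scale
Metalhead.DropPath(-1)      === identity   # a negative rate disables stochastic depth
dp = Metalhead.DropPath(0.1)               # Dropout(0.1; dims = 4): drops whole samples

block = Metalhead.resmixerblock(64, 196)           # planes = 64, npatches = 196
size(block(rand(Float32, 64, 196, 2)))             # (64, 196, 2): residual blocks keep the shape
```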
""" function ResMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, - drop_path_rate, depth, nclasses) - ResMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(resmixerblock, imsize; mlp_ratio = 4.0, patch_size, embedplanes, + drop_path_rate, depth, nclasses) + return ResMLP(layers) end @functor ResMLP @@ -179,14 +189,15 @@ classifier(m::ResMLP) = m.layers[2] Creates a spatial gating unit as described in the gMLP paper. ([reference](https://arxiv.org/abs/2105.08050)) - + # Arguments -- `norm`: the normalisation layer to use -- `proj`: the projection layer to use + + - `norm`: the normalisation layer to use + - `proj`: the projection layer to use """ struct SpatialGatingUnit{T, F} - norm::T - proj::F + norm::T + proj::F end """ @@ -196,24 +207,25 @@ Creates a spatial gating unit as described in the gMLP paper. ([reference](https://arxiv.org/abs/2105.08050)) # Arguments -- `planes`: the number of planes in the block -- `npatches`: the number of patches of the input -- `norm_layer`: the normalisation layer to use + + - `planes`: the number of planes in the block + - `npatches`: the number of patches of the input + - `norm_layer`: the normalisation layer to use """ function SpatialGatingUnit(planes::Integer, npatches::Integer; norm_layer = LayerNorm) - gateplanes = planes ÷ 2 - norm = norm_layer(gateplanes) - proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) - return SpatialGatingUnit(norm, proj) + gateplanes = planes ÷ 2 + norm = norm_layer(gateplanes) + proj = Dense(2 * eps(Float32) .* rand(Float32, npatches, npatches), ones(npatches)) + return SpatialGatingUnit(norm, proj) end @functor SpatialGatingUnit function (m::SpatialGatingUnit)(x) - u, v = chunk(x, 2; dims = 1) - v = m.norm(v) - v = m.proj(permutedims(v, (2, 1, 3))) - return u .* permutedims(v, (2, 1, 3)) + u, v = chunk(x, 2; dims = 1) + v = m.norm(v) + v = m.proj(permutedims(v, (2, 1, 3))) + return u .* permutedims(v, (2, 1, 3)) end """ @@ -225,27 +237,29 @@ Creates a feedforward block based on the gMLP model architecture described in th ([reference](https://arxiv.org/abs/2105.08050)) # Arguments -- `planes`: the number of planes in the block -- `npatches`: the number of patches of the input -- `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number - of planes in the block -- `norm_layer`: the normalisation layer to use -- `dropout`: the dropout rate to use in the MLP blocks -- `drop_path_rate`: Stochastic depth rate -- `activation`: the activation function to use in the MLP blocks + + - `planes`: the number of planes in the block + - `npatches`: the number of patches of the input + - `mlp_ratio`: ratio of the number of hidden channels in the channel mixing MLP to the number + of planes in the block + - `norm_layer`: the normalisation layer to use + - `dropout`: the dropout rate to use in the MLP blocks + - `drop_path_rate`: Stochastic depth rate + - `activation`: the activation function to use in the MLP blocks """ function 
spatial_gating_block(planes, npatches; mlp_ratio = 4.0, norm_layer = LayerNorm, - mlp_layer = gated_mlp_block, dropout = 0., drop_path_rate = 0., + mlp_layer = gated_mlp_block, dropout = 0.0, + drop_path_rate = 0.0, activation = gelu) - channelplanes = Int(mlp_ratio * planes) - sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) - return SkipConnection(Chain(norm_layer(planes), - mlp_layer(sgu, planes, channelplanes; activation, dropout), - DropPath(drop_path_rate)), +) + channelplanes = Int(mlp_ratio * planes) + sgu = inplanes -> SpatialGatingUnit(inplanes, npatches; norm_layer) + return SkipConnection(Chain(norm_layer(planes), + mlp_layer(sgu, planes, channelplanes; activation, dropout), + DropPath(drop_path_rate)), +) end struct gMLP - layers + layers::Any end """ @@ -256,23 +270,23 @@ Creates a model with the gMLP architecture. ([reference](https://arxiv.org/abs/2105.08050)). # Arguments -- `size`: the size of the model - one of `small`, `base`, `large` or `huge` -- `patch_size`: the size of the patches -- `imsize`: the size of the input image -- `drop_path_rate`: Stochastic depth rate -- `nclasses`: number of output classes + + - `size`: the size of the model - one of `small`, `base`, `large` or `huge` + - `patch_size`: the size of the patches + - `imsize`: the size of the input image + - `drop_path_rate`: Stochastic depth rate + - `nclasses`: number of output classes See also [`Metalhead.mlpmixer`](#). """ function gMLP(size::Symbol = :base; patch_size::Dims{2} = (16, 16), - imsize::Dims{2} = (224, 224), drop_path_rate = 0., nclasses = 1000) - @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" - depth = mixer_configs[size][:depth] - embedplanes = mixer_configs[size][:planes] - layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, - patch_size, embedplanes, drop_path_rate, depth, nclasses) - - gMLP(layers) + imsize::Dims{2} = (224, 224), drop_path_rate = 0.0, nclasses = 1000) + @assert size in keys(mixer_configs) "`size` must be one of $(keys(mixer_configs))" + depth = mixer_configs[size][:depth] + embedplanes = mixer_configs[size][:planes] + layers = mlpmixer(spatial_gating_block, imsize; mlp_layer = gated_mlp_block, + patch_size, embedplanes, drop_path_rate, depth, nclasses) + return gMLP(layers) end @functor gMLP diff --git a/src/pretrain.jl b/src/pretrain.jl index 97ab7398e..24e6d176d 100644 --- a/src/pretrain.jl +++ b/src/pretrain.jl @@ -4,17 +4,17 @@ Load the pre-trained weights for `model` using the stored artifacts. 
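The `gMLP` constructor reformatted above builds on `spatial_gating_block`, whose `SpatialGatingUnit` splits the hidden channels in half — which is why `gated_mlp_block` requires an even `hidden_planes`. A hedged usage sketch mirroring `test/other.jl`:

```julia
using Metalhead

m = gMLP(:small; drop_path_rate = 0.1)     # spatial gating blocks + gated MLPs
size(m(rand(Float32, 224, 224, 3, 1)))     # (1000, 1) with the default nclasses = 1000
```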
""" function weights(model) - try - path = joinpath(@artifact_str(model), "$model.bson") - artifact = BSON.load(path, @__MODULE__) - if haskey(artifact, :model) - return artifact[:model] - else - throw(ArgumentError("No pre-trained weights available for $model.")) + try + path = joinpath(@artifact_str(model), "$model.bson") + artifact = BSON.load(path, @__MODULE__) + if haskey(artifact, :model) + return artifact[:model] + else + throw(ArgumentError("No pre-trained weights available for $model.")) + end + catch e + throw(ArgumentError("No pre-trained weights available for $model.")) end - catch e - throw(ArgumentError("No pre-trained weights available for $model.")) - end end """ diff --git a/src/utilities.jl b/src/utilities.jl index 39dbdd3b2..dd3f2ed74 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -1,12 +1,12 @@ # Utility function for classifier head of vision transformer-like models -seconddimmean(x) = dropdims(mean(x, dims = 2); dims = 2) +seconddimmean(x) = dropdims(mean(x; dims = 2); dims = 2) # utility function for making sure that all layers have a channel size divisible by 8 # used by MobileNet variants function _round_channels(channels, divisor, min_value = divisor) - new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) - # Make sure that round down does not go down by more than 10% - return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels + new_channels = max(min_value, floor(Int, channels + divisor / 2) ÷ divisor * divisor) + # Make sure that round down does not go down by more than 10% + return (new_channels < 0.9 * channels) ? new_channels + divisor : new_channels end """ @@ -47,11 +47,11 @@ swapdims(perm) = Base.Fix2(permutedims, perm) # Utility function for pretty printing large models function _maybe_big_show(io, model) - if isdefined(Flux, :_big_show) - if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL - Flux._big_show(io, model) - else - show(io, model) + if isdefined(Flux, :_big_show) + if isnothing(get(io, :typeinfo, nothing)) # e.g. top level in REPL + Flux._big_show(io, model) + else + show(io, model) + end end - end end diff --git a/src/vit-based/vit.jl b/src/vit-based/vit.jl index 55b3e3d30..dffc93ccf 100644 --- a/src/vit-based/vit.jl +++ b/src/vit-based/vit.jl @@ -1,23 +1,26 @@ """ - transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) +transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) Transformer as used in the base ViT architecture. ([reference](https://arxiv.org/abs/2010.11929)). # Arguments -- `planes`: number of input channels -- `depth`: number of attention blocks -- `nheads`: number of attention heads -- `mlp_ratio`: ratio of MLP layers to the number of input channels -- `dropout`: dropout rate + + - `planes`: number of input channels + - `depth`: number of attention blocks + - `nheads`: number of attention heads + - `mlp_ratio`: ratio of MLP layers to the number of input channels + - `dropout`: dropout rate """ -function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.) 
- layers = [Chain(SkipConnection(prenorm(planes, MHAttention(planes, nheads; attn_drop = dropout, - proj_drop = dropout)), +), - SkipConnection(prenorm(planes, mlp_block(planes, floor(Int, mlp_ratio * planes); - dropout)), +)) - for _ in 1:depth] - Chain(layers) +function transformer_encoder(planes, depth, nheads; mlp_ratio = 4.0, dropout = 0.0) + layers = [Chain(SkipConnection(prenorm(planes, + MHAttention(planes, nheads; attn_drop = dropout, + proj_drop = dropout)), +), + SkipConnection(prenorm(planes, + mlp_block(planes, floor(Int, mlp_ratio * planes); + dropout)), +)) + for _ in 1:depth] + return Chain(layers) end """ @@ -29,32 +32,32 @@ Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). # Arguments -- `imsize`: image size -- `inchannels`: number of input channels -- `patch_size`: size of the patches -- `embedplanes`: the number of channels after the patch embedding -- `depth`: number of blocks in the transformer -- `nheads`: number of attention heads in the transformer -- `mlpplanes`: number of hidden channels in the MLP block in the transformer -- `dropout`: dropout rate -- `emb_dropout`: dropout rate for the positional embedding layer -- `pool`: pooling type, either :class or :mean -- `nclasses`: number of classes in the output + + - `imsize`: image size + - `inchannels`: number of input channels + - `patch_size`: size of the patches + - `embedplanes`: the number of channels after the patch embedding + - `depth`: number of blocks in the transformer + - `nheads`: number of attention heads in the transformer + - `mlpplanes`: number of hidden channels in the MLP block in the transformer + - `dropout`: dropout rate + - `emb_dropout`: dropout rate for the positional embedding layer + - `pool`: pooling type, either :class or :mean + - `nclasses`: number of classes in the output """ function vit(imsize::Dims{2} = (256, 256); inchannels = 3, patch_size::Dims{2} = (16, 16), embedplanes = 768, depth = 6, nheads = 16, mlp_ratio = 4.0, dropout = 0.1, emb_dropout = 0.1, pool = :class, nclasses = 1000) - - @assert pool in [:class, :mean] - "Pool type must be either :class (class token) or :mean (mean pooling)" - npatches = prod(imsize .÷ patch_size) - return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), - ClassTokens(embedplanes), - ViPosEmbedding(embedplanes, npatches + 1), - Dropout(emb_dropout), - transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), - (pool == :class) ? x -> x[:, 1, :] : seconddimmean), - Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) + @assert pool in [:class, :mean] + "Pool type must be either :class (class token) or :mean (mean pooling)" + npatches = prod(imsize .÷ patch_size) + return Chain(Chain(PatchEmbedding(imsize; inchannels, patch_size, embedplanes), + ClassTokens(embedplanes), + ViPosEmbedding(embedplanes, npatches + 1), + Dropout(emb_dropout), + transformer_encoder(embedplanes, depth, nheads; mlp_ratio, dropout), + (pool == :class) ? 
x -> x[:, 1, :] : seconddimmean), + Chain(LayerNorm(embedplanes), Dense(embedplanes, nclasses, tanh_fast))) end vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), @@ -62,8 +65,10 @@ vit_configs = Dict(:tiny => (depth = 12, embedplanes = 192, nheads = 3), :base => (depth = 12, embedplanes = 768, nheads = 12), :large => (depth = 24, embedplanes = 1024, nheads = 16), :huge => (depth = 32, embedplanes = 1280, nheads = 16), - :giant => (depth = 40, embedplanes = 1408, nheads = 16, mlp_ratio = 48/11), - :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, mlp_ratio = 64/13)) + :giant => (depth = 40, embedplanes = 1408, nheads = 16, + mlp_ratio = 48 // 11), + :gigantic => (depth = 48, embedplanes = 1664, nheads = 16, + mlp_ratio = 64 // 13)) """ ViT(mode::Symbol = base; imsize::Dims{2} = (256, 256), inchannels = 3, @@ -73,26 +78,27 @@ Creates a Vision Transformer (ViT) model. ([reference](https://arxiv.org/abs/2010.11929)). # Arguments -- `mode`: the model configuration, one of [:tiny, :small, :base, :large, :huge, :giant, :gigantic] -- `imsize`: image size -- `inchannels`: number of input channels -- `patch_size`: size of the patches -- `pool`: pooling type, either :class or :mean -- `nclasses`: number of classes in the output + + - `mode`: the model configuration, one of [:tiny, :small, :base, :large, :huge, :giant, :gigantic] + - `imsize`: image size + - `inchannels`: number of input channels + - `patch_size`: size of the patches + - `pool`: pooling type, either :class or :mean + - `nclasses`: number of classes in the output See also [`Metalhead.vit`](#). """ struct ViT - layers + layers::Any end function ViT(mode::Symbol = :base; imsize::Dims{2} = (256, 256), inchannels = 3, patch_size::Dims{2} = (16, 16), pool = :class, nclasses = 1000) - @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" - kwargs = vit_configs[mode] - layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) + @assert mode in keys(vit_configs) "`mode` must be one of $(keys(vit_configs))" + kwargs = vit_configs[mode] + layers = vit(imsize; inchannels, patch_size, nclasses, pool, kwargs...) 
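The switch from `48/11` to `48 // 11` in `vit_configs` keeps the giant/gigantic hidden widths exact: the MLP width inside `transformer_encoder` is `floor(Int, mlp_ratio * planes)`. A short sketch of the arithmetic plus a forward pass matching `test/vit-based.jl` (sizes illustrative):

```julia
using Metalhead

floor(Int, 48 // 11 * 1408)   # 6144 hidden planes for :giant   (1408 = 11 * 128)
floor(Int, 64 // 13 * 1664)   # 8192 hidden planes for :gigantic (1664 = 13 * 128)

m = ViT(:small)                               # depth 12, 384 planes, 6 heads
size(m(rand(Float32, 256, 256, 3, 1)))        # (1000, 1) with the default nclasses = 1000
```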
- ViT(layers) + return ViT(layers) end (m::ViT)(x) = m.layers(x) diff --git a/test/convnets.jl b/test/convnets.jl index 3540c3e9f..3a26477cf 100644 --- a/test/convnets.jl +++ b/test/convnets.jl @@ -5,202 +5,194 @@ using Flux PRETRAINED_MODELS = [] @testset "AlexNet" begin - model = AlexNet() - @test size(model(x_256)) == (1000, 1) - @test_throws ArgumentError AlexNet(pretrain = true) - @test gradtest(model, x_256) + model = AlexNet() + @test size(model(x_256)) == (1000, 1) + @test_throws ArgumentError AlexNet(pretrain = true) + @test gradtest(model, x_256) end GC.safepoint() GC.gc() @testset "VGG" begin - @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] - m = VGG(sz, batchnorm = bn) - - @test size(m(x_224)) == (1000, 1) - if (VGG, sz, bn) in PRETRAINED_MODELS - @test (VGG(sz, batchnorm = bn, pretrain = true); true) - else - @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + @testset "VGG($sz, batchnorm=$bn)" for sz in [11, 13, 16, 19], bn in [true, false] + m = VGG(sz, batchnorm = bn) + @test size(m(x_224)) == (1000, 1) + if (VGG, sz, bn) in PRETRAINED_MODELS + @test (VGG(sz, batchnorm = bn, pretrain = true); true) + else + @test_throws ArgumentError VGG(sz, batchnorm = bn, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end end GC.safepoint() GC.gc() @testset "ResNet" begin - @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] - m = ResNet(sz) - - @test size(m(x_256)) == (1000, 1) - if (ResNet, sz) in PRETRAINED_MODELS - @test (ResNet(sz, pretrain = true); true) - else - @test_throws ArgumentError ResNet(sz, pretrain = true) + @testset "ResNet($sz)" for sz in [18, 34, 50, 101, 152] + m = ResNet(sz) + @test size(m(x_256)) == (1000, 1) + if (ResNet, sz) in PRETRAINED_MODELS + @test (ResNet(sz, pretrain = true); true) + else + @test_throws ArgumentError ResNet(sz, pretrain = true) + end + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() - end - - @testset "Shortcut C" begin - m = Metalhead.resnet(Metalhead.basicblock, :C; - channel_config = [1, 1], - block_config = [2, 2, 2, 2]) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - end + @testset "Shortcut C" begin + m = Metalhead.resnet(Metalhead.basicblock, :C; + channel_config = [1, 1], + block_config = [2, 2, 2, 2]) + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + end end GC.safepoint() GC.gc() @testset "ResNeXt" begin - @testset for depth in [50, 101, 152] - m = ResNeXt(depth) - - @test size(m(x_224)) == (1000, 1) - if ResNeXt in PRETRAINED_MODELS - @test (ResNeXt(depth, pretrain = true); true) - else - @test_throws ArgumentError ResNeXt(depth, pretrain = true) + @testset for depth in [50, 101, 152] + m = ResNeXt(depth) + @test size(m(x_224)) == (1000, 1) + if ResNeXt in PRETRAINED_MODELS + @test (ResNeXt(depth, pretrain = true); true) + else + @test_throws ArgumentError ResNeXt(depth, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end end GC.safepoint() GC.gc() @testset "GoogLeNet" begin - m = GoogLeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (GoogLeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = GoogLeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (GoogLeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset 
"Inception3" begin - m = Inception3() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError Inception3(pretrain = true) - @test gradtest(m, x_224) + m = Inception3() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError Inception3(pretrain = true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset "SqueezeNet" begin - m = SqueezeNet() - @test size(m(x_224)) == (1000, 1) - @test_throws ArgumentError (SqueezeNet(pretrain = true); true) - @test gradtest(m, x_224) + m = SqueezeNet() + @test size(m(x_224)) == (1000, 1) + @test_throws ArgumentError (SqueezeNet(pretrain = true); true) + @test gradtest(m, x_224) end GC.safepoint() GC.gc() @testset "DenseNet" begin - @testset for sz in [121, 161, 169, 201] - m = DenseNet(sz) - - @test size(m(x_224)) == (1000, 1) - if (DenseNet, sz) in PRETRAINED_MODELS - @test (DenseNet(sz, pretrain = true); true) - else - @test_throws ArgumentError DenseNet(sz, pretrain = true) + @testset for sz in [121, 161, 169, 201] + m = DenseNet(sz) + @test size(m(x_224)) == (1000, 1) + if (DenseNet, sz) in PRETRAINED_MODELS + @test (DenseNet(sz, pretrain = true); true) + else + @test_throws ArgumentError DenseNet(sz, pretrain = true) + end + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() end - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end end GC.safepoint() GC.gc() @testset "MobileNet" verbose = true begin - @testset "MobileNetv1" begin - m = MobileNetv1() - - @test size(m(x_224)) == (1000, 1) - if MobileNetv1 in PRETRAINED_MODELS - @test (MobileNetv1(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv1(pretrain = true) + @testset "MobileNetv1" begin + m = MobileNetv1() + @test size(m(x_224)) == (1000, 1) + if MobileNetv1 in PRETRAINED_MODELS + @test (MobileNetv1(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv1(pretrain = true) + end + @test gradtest(m, x_224) end - @test gradtest(m, x_224) - end - GC.safepoint() - GC.gc() + GC.safepoint() + GC.gc() + + @testset "MobileNetv2" begin + m = MobileNetv2() + @test size(m(x_224)) == (1000, 1) + if MobileNetv2 in PRETRAINED_MODELS + @test (MobileNetv2(pretrain = true); true) + else + @test_throws ArgumentError MobileNetv2(pretrain = true) + end + @test gradtest(m, x_224) + end - @testset "MobileNetv2" begin - m = MobileNetv2() + GC.safepoint() + GC.gc() - @test size(m(x_224)) == (1000, 1) - if MobileNetv2 in PRETRAINED_MODELS - @test (MobileNetv2(pretrain = true); true) - else - @test_throws ArgumentError MobileNetv2(pretrain = true) + @testset "MobileNetv3" verbose = true begin + @testset for mode in [:small, :large] + m = MobileNetv3(mode) + + @test size(m(x_224)) == (1000, 1) + if MobileNetv3 in PRETRAINED_MODELS + @test (MobileNetv3(mode; pretrain = true); true) + else + @test_throws ArgumentError MobileNetv3(mode; pretrain = true) + end + @test gradtest(m, x_224) + end end - @test gradtest(m, x_224) - end - - GC.safepoint() - GC.gc() - - @testset "MobileNetv3" verbose = true begin - @testset for mode in [:small, :large] - m = MobileNetv3(mode) - - @test size(m(x_224)) == (1000, 1) - if MobileNetv3 in PRETRAINED_MODELS - @test (MobileNetv3(mode; pretrain = true); true) - else - @test_throws ArgumentError MobileNetv3(mode; pretrain = true) - end - @test gradtest(m, x_224) end - end -end - -GC.safepoint() -GC.gc() -@testset "ConvNeXt" verbose = true begin - @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] - @testset for drop_path_rate in [0.0, 0.5] - m = ConvNeXt(mode; drop_path_rate) + GC.safepoint() + GC.gc() - @test 
size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end - end + @testset "ConvNeXt" verbose = true begin + @testset for mode in [:small, :base, :large] # :tiny, #, :xlarge] + @testset for drop_path_rate in [0.0, 0.5] + m = ConvNeXt(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end + end end GC.safepoint() GC.gc() @testset "ConvMixer" verbose = true begin - @testset for mode in [:small, :base, :large] - m = ConvMixer(mode) + @testset for mode in [:small, :base, :large] + m = ConvMixer(mode) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end diff --git a/test/other.jl b/test/other.jl index 0162bc4bc..ae964d6d1 100644 --- a/test/other.jl +++ b/test/other.jl @@ -2,37 +2,37 @@ using Metalhead, Test using Flux @testset "MLPMixer" begin - @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = MLPMixer(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end - end + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = MLPMixer(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end + end end @testset "ResMLP" begin @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = ResMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() - end + @testset for drop_path_rate in [0.0, 0.5] + m = ResMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end end @testset "gMLP" begin - @testset for mode in [:small, :base, :large] # :huge] - @testset for drop_path_rate in [0.0, 0.5] - m = gMLP(mode; drop_path_rate) - @test size(m(x_224)) == (1000, 1) - @test gradtest(m, x_224) - GC.safepoint() - GC.gc() + @testset for mode in [:small, :base, :large] # :huge] + @testset for drop_path_rate in [0.0, 0.5] + m = gMLP(mode; drop_path_rate) + @test size(m(x_224)) == (1000, 1) + @test gradtest(m, x_224) + GC.safepoint() + GC.gc() + end end - end end diff --git a/test/runtests.jl b/test/runtests.jl index 6dd4a1aa4..610cbf40e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,11 +3,11 @@ using Flux using Flux: Zygote function gradtest(model, input) - y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) - gs = pb(ones(Float32, size(y))) + y, pb = Zygote.pullback(() -> model(input), Flux.params(model)) + gs = pb(ones(Float32, size(y))) - # if we make it to here with no error, success! - return true + # if we make it to here with no error, success! 
+ return true end x_224 = rand(Float32, 224, 224, 3, 1) @@ -15,7 +15,7 @@ x_256 = rand(Float32, 256, 256, 3, 1) # CNN tests @testset verbose = true "ConvNets" begin - include("convnets.jl") + include("convnets.jl") end GC.safepoint() @@ -23,7 +23,7 @@ GC.gc() # Other tests @testset verbose = true "Other" begin - include("other.jl") + include("other.jl") end GC.safepoint() @@ -31,5 +31,5 @@ GC.gc() # ViT tests @testset verbose = true "ViTs" begin - include("vit-based.jl") + include("vit-based.jl") end diff --git a/test/vit-based.jl b/test/vit-based.jl index 20b6ecb86..cdaffc430 100644 --- a/test/vit-based.jl +++ b/test/vit-based.jl @@ -2,11 +2,11 @@ using Metalhead, Test using Flux @testset "ViT" begin - for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] - m = ViT(mode) - @test size(m(x_256)) == (1000, 1) - @test gradtest(m, x_256) - GC.safepoint() - GC.gc() - end + for mode in [:small, :base, :large] # :tiny, #,:huge, :giant, :gigantic] + m = ViT(mode) + @test size(m(x_256)) == (1000, 1) + @test gradtest(m, x_256) + GC.safepoint() + GC.gc() + end end
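The re-indented test files are still driven from `test/runtests.jl`; a minimal sketch of exercising them from a local checkout of the package:

```julia
using Pkg

Pkg.activate(".")   # from the root of the Metalhead.jl clone
Pkg.test()          # runs convnets.jl, other.jl and vit-based.jl via runtests.jl
```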