Add WeightNorm reparametrization #2550

Merged · 12 commits · Dec 13, 2024
Changes from 6 commits
3 changes: 3 additions & 0 deletions NEWS.md
@@ -2,6 +2,9 @@

See also [github's page](https://github.com/FluxML/Flux.jl/releases) for a complete list of PRs merged before each release.

## v0.15.3
* Add `WeightNorm` normalization layer.

## v0.15.0 (December 2024)
This release includes two **breaking changes**:
- The recurrent layers have been thoroughly revised. See below and read the [documentation](https://fluxml.ai/Flux.jl/v0.15/guide/models/recurrence/) for details.
2 changes: 2 additions & 0 deletions docs/src/reference/models/layers.md
@@ -126,6 +126,8 @@ AlphaDropout
LayerNorm
InstanceNorm
GroupNorm
WeightNorm
Flux.remove_weight_norms
Flux.normalise
```

3 changes: 1 addition & 2 deletions ext/FluxAMDGPUExt/FluxAMDGPUExt.jl
@@ -3,8 +3,7 @@ module FluxAMDGPUExt
import ChainRulesCore
import ChainRulesCore: NoTangent
import Flux
import Flux: adapt_storage, fmap
import Flux: DenseConvDims, Conv, ConvTranspose, conv, conv_reshape_bias
import Flux: fmap, DenseConvDims, Conv, ConvTranspose, conv, conv_reshape_bias
import NNlib
using MLDataDevices
using AMDGPU
2 changes: 1 addition & 1 deletion src/Flux.jl
@@ -42,7 +42,7 @@ export Chain, Dense, Embedding, EmbeddingBag,
SamePad, Conv, CrossCor, ConvTranspose, DepthwiseConv,
AdaptiveMaxPool, AdaptiveMeanPool, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool,
Dropout, AlphaDropout,
LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
LayerNorm, BatchNorm, InstanceNorm, GroupNorm, WeightNorm,
MultiHeadAttention,
Upsample, PixelShuffle,
fmap, cpu, gpu, f32, f64, f16, rand32, randn32, zeros32, ones32,
114 changes: 114 additions & 0 deletions src/layers/normalise.jl
@@ -568,3 +568,117 @@
See [`BatchNorm`](@ref), [`InstanceNorm`](@ref), [`GroupNorm`](@ref), and [`LayerNorm`](@ref).
"""
hasaffine(l::Union{BatchNorm, InstanceNorm, LayerNorm, GroupNorm}) = l.affine

struct WeightNorm{which, dims, L, G}
    layer::L
    g::G
end
@layer WeightNorm

"""
    WeightNorm(layer::L, which::Symbol = :weight; dims = -1)

Apply weight normalization to the parameter named `which` in the given `layer`.

``w = g \\frac{\\mathbf{v}}{\\lVert \\mathbf{v} \\rVert}``

Decouples the magnitude of a weight tensor from its direction.
By default, normalization is applied along the output channel dimension,
`dims = -1` (equivalent to `dims = ndims(w)`).

### Example

```jldoctest
julia> c = Conv((3,), 1 => 2);

julia> wc = WeightNorm(c, :weight)
WeightNorm(
  Conv((3,), 1 => 2),            # 8 parameters
  3×1×1 Array{Float32,...},      # 3 parameters
  3×1×2 Array{Float32,...},      # 6 parameters
)                   # Total: 4 arrays, 17 parameters, 348 bytes.

julia> x = ones(Float32, 12, 1, 1);

julia> c(x) ≈ wc(x) # forward pass is the same as with the original layer
true
```

# Reference

Salimans & Kingma, _Weight Normalization_ (2016) <https://arxiv.org/abs/1602.07868>
"""
function WeightNorm(layer::L, which::Symbol = :weight; dims = -1) where L
    hasfield(L, which) || throw(ArgumentError("`$L` does not have field `:$which`."))

    x = getfield(layer, which)
    iszero(x) && throw(ArgumentError(
        "`$which` field for `$(typeof(layer))` is all zero, which will result in NaN."))

    d = if dims isa Colon
        1:ndims(x)
    elseif dims == -1
        dims = ndims(x)
    else
        dims
    end

    g = sqrt.(sum(abs2, x; dims) .+ eps(eltype(x)))
    x ./= g # Store `v` in the original weights.
    WeightNorm{which, dims, L, typeof(g)}(layer, g)
end
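As a quick sanity check on what this constructor does (a minimal standalone sketch, not part of the PR, assuming a `Dense` layer and the default `dims`): after construction the wrapped field holds the direction `v`, normalized along `dims` up to the `eps` term, while `g` holds the magnitude, so `g .* v` recovers the original weight.

```julia
using Flux

d  = Dense(3 => 4)
w0 = copy(d.weight)          # original 4×3 weight, copied before it is rescaled in place
wn = WeightNorm(d, :weight)  # default dims = -1, i.e. dims = ndims(w0) = 2

g = wn.g                     # magnitude, size 4×1
v = wn.layer.weight          # stored direction, each row has norm ≈ 1

@assert g .* v ≈ w0          # the reparametrization reproduces the original weight
@assert all(x -> isapprox(x, 1f0; atol = 1f-5), sqrt.(sum(abs2, v; dims = 2)))
```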

(w::WeightNorm)(x) = transform(w)(x)

function transform(wn::WeightNorm{which, dims}) where {which, dims}
    ϵ = eps(eltype(wn.g))
    v = getfield(wn.layer, which)
    n2 = sum(abs2, v; dims)
    w = @. wn.g * v / sqrt(n2 + ϵ)

    fields, ctor = Functors.functor(wn.layer)
    return ctor(merge(
        fields, NamedTuple{(which,)}((w,)),
    ))
end
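Since `transform` rebuilds the wrapped layer with the effective weight on every call, gradients reach both `g` and the stored direction during training. A hedged sketch of a single training step (not part of the diff), assuming the usual Flux `setup`/`withgradient`/`update!` workflow:

```julia
using Flux

model = WeightNorm(Dense(3 => 4))
x, y = randn(Float32, 3, 16), randn(Float32, 4, 16)

opt_state = Flux.setup(Adam(1f-3), model)

loss, grads = Flux.withgradient(model) do m
    Flux.mse(m(x), y)
end

# Both the magnitude `g` and the direction stored in `layer.weight` receive gradients.
@assert grads[1].g !== nothing
@assert grads[1].layer.weight !== nothing

Flux.update!(opt_state, model, grads[1])
```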

function Base.show(io::IO, w::WeightNorm{which, dims}) where {which, dims}
    print(io, "WeightNorm(")
    Base.show(io, w.layer)
    print(io, ", :", which, "; dims=", dims)
    print(io, ")")
end

"""
    remove_weight_norms(x)

Remove any [`WeightNorm`](@ref) parametrization in the model.

### Example

```jldoctest
julia> model = Chain(
           WeightNorm(Conv((3,), 1 => 2), :weight),
           WeightNorm(Conv((3,), 2 => 2), :weight),
       )
Chain(
  WeightNorm(
    Conv((3,), 1 => 2),            # 8 parameters
    3×1×1 Array{Float32,...},      # 3 parameters
    3×1×2 Array{Float32,...},      # 6 parameters
  ),
  WeightNorm(
    Conv((3,), 2 => 2),            # 14 parameters
    3×2×1 Array{Float32,...},      # 6 parameters
    3×2×2 Array{Float32,...},      # 12 parameters
  ),
)                   # Total: 8 arrays, 49 parameters, 756 bytes.

julia> Flux.remove_weight_norms(model)
Chain(
  Conv((3,), 1 => 2),      # 8 parameters
  Conv((3,), 2 => 2),      # 14 parameters
)                   # Total: 4 arrays, 22 parameters, 392 bytes.
```
"""
remove_weight_norms(x) = fmap(transform, x; exclude=l -> l isa WeightNorm)
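Because `fmap` stops recursing exactly at `WeightNorm` nodes (via `exclude`) and applies `transform` there, nested containers are unwrapped in place while untouched layers pass through unchanged. A small illustrative sketch with a nested `Chain` (not part of the diff):

```julia
using Flux

nested = Chain(
    Chain(WeightNorm(Conv((3,), 1 => 2)), Conv((3,), 2 => 2)),
    WeightNorm(Conv((3,), 2 => 4)),
)

plain = Flux.remove_weight_norms(nested)
@assert plain[1][1] isa Conv  # unwrapped
@assert plain[1][2] isa Conv  # passed through unchanged
@assert plain[2]    isa Conv  # unwrapped
```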

1 change: 1 addition & 0 deletions src/layers/recurrent.jl
@@ -103,6 +103,7 @@ x = rand(Float32, 10)

# Run forward
res = rnn(x, h0)
```
"""
initialstates(rnn::RNNCell) = zeros_like(rnn.Wh, size(rnn.Wh, 2))

56 changes: 56 additions & 0 deletions test/layers/normalisation.jl
@@ -441,6 +441,62 @@ end
    @test_throws Exception GroupNorm(5, 5; active=:something_else)
end

@testset "WeightNorm" begin
x = rand(Float32, 1, 3)
mn = WeightNorm(Dense(1 => 2))
m = Flux.remove_weight_norms(mn)
@test m(x) ≈ mn(x)

@test_throws ArgumentError WeightNorm(m, :weights)
@test_throws "does not have field" WeightNorm(m, :weights)

@test_throws ArgumentError WeightNorm(m, :bias)
@test_throws "is all zero" WeightNorm(m, :bias)

og = (Zygote.gradient(m) do m
sum(m(x))
end)[1]
g = (Zygote.gradient(mn) do mn
sum(mn(x))
end)[1]

@test g.layer.weight ≢ nothing # Original weight acts as a direction `v`.
@test g.layer.bias ≢ nothing
@test g.g ≢ nothing

# Compare gradients with original layer.

v = mn.layer.weight
ϵ = eps(eltype(v))
n2 = sum(abs2, v; dims=2)

@test (og.weight .* v ./ sqrt.(n2 .+ ϵ)) ≈ g.g
@test (og.weight .* mn.g ./ n2 .- mn.g .* g.g .* v ./ n2.^2) ≈ g.layer.weight atol=1f-6

# Test WeightNorm removal.

om = Flux.remove_weight_norms(mn)
@test om isa Dense
@test om.weight ≈ m.weight
@test om.bias ≈ m.bias

# Test with Chain.

c = Chain(
WeightNorm(Conv((3,), 1 => 2)),
WeightNorm(Conv((3,), 2 => 2)),
)
@test c[1] isa WeightNorm
@test c[2] isa WeightNorm

oc = Flux.remove_weight_norms(c)
@test oc[1] isa Conv
@test oc[2] isa Conv

x = rand(Float32, 12, 1, 1)
@test c(x) ≈ oc(x)
end
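For reference (added here, not part of the PR), the two gradient comparisons above follow from the chain rule for ``w = g\,v / \lVert v\rVert``, written per output slice and ignoring the `eps` term:

```math
\frac{\partial L}{\partial g}
  = \frac{\partial L}{\partial w} \cdot \frac{v}{\lVert v \rVert},
\qquad
\frac{\partial L}{\partial v}
  = \frac{\partial L}{\partial w}\,\frac{g}{\lVert v \rVert}
  - g\,\frac{\partial L}{\partial g}\,\frac{v}{\lVert v \rVert^{2}} .
```

The second `@test` divides by `n2` and `n2.^2` (powers of ``\lVert v\rVert^2``) rather than `sqrt.(n2)` and `n2`; the comparison still holds because the `WeightNorm` constructor stores a direction with ``\lVert v\rVert \approx 1``, which is the state this test runs in.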

@testset "second derivatives" begin
m1 = Dropout(0.5)
@test Zygote.hessian_reverse(sum∘m1, [1.0,2.0,3.0]) == zeros(3, 3)