
Commit e68e6c2

Layernorm (#203)
* layernorm: initial implementation
* layernorm: rename source file
* layernorm: remove redundant arguments
* layernorm: remove stack allocated arrays
* layernorm: rearrange into submodule
* layernorm: add error to stop in test
* layernorm: add gradient updates
* layernorm: public api
* layernorm: update tests
* layernorm: update cmake
* layernorm: use mold for temp allocation
* layernorm: rename to layernorm
* layernorm: allow usage of layernorm at the end
* layernorm: integration test for layernorm
* layernorm: memory allocation optimization
* Tidy up
* Bump version
* Add layernorm to the table of layers

---------

Co-authored-by: milancurcic <[email protected]>
1 parent ed8b340 commit e68e6c2

12 files changed (+522, -19 lines)

CMakeLists.txt (+2)

@@ -37,6 +37,8 @@ add_library(neural-fortran
   src/nf/nf_input3d_layer_submodule.f90
   src/nf/nf_layer_constructors.f90
   src/nf/nf_layer_constructors_submodule.f90
+  src/nf/nf_layernorm.f90
+  src/nf/nf_layernorm_submodule.f90
   src/nf/nf_layer.f90
   src/nf/nf_layer_submodule.f90
   src/nf/nf_linear2d_layer.f90

README.md (+3 -2)

@@ -34,8 +34,9 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
 | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
 | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
-| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
-| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Linear (2-d) | `linear2d` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Self-attention | `self_attention` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Layer Normalization | `layernorm` | `linear2d`, `self_attention` | 2 | ✅ | ✅ |
 | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
 | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |
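Per the new table rows, layernorm takes its input from linear2d or self_attention and can feed either of them (or, per the commit message, sit at the end of the network). A minimal construction sketch; the shapes (sequence length 3, model dimension 8, 2 attention heads) are illustrative and not taken from this commit:

program layernorm_usage
  use nf, only: network, input, linear2d, self_attention, layernorm
  implicit none

  type(network) :: net

  ! input(3, 8) assumes the generic input constructor builds a 2-d input
  ! layer from (sequence_length, model_dimension), as for the other 2-d layers.
  net = network([ &
    input(3, 8), &
    linear2d(8), &
    self_attention(2), &
    layernorm() &
  ])
end program layernorm_usage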

fpm.toml (+1 -1)

@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.19.0"
+version = "0.20.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "[email protected]"

src/nf.f90 (+2 -1)

@@ -11,7 +11,8 @@ module nf
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention
+    self_attention, &
+    layernorm
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network

src/nf/nf_layer_constructors.f90 (+19 -10)

@@ -17,7 +17,8 @@ module nf_layer_constructors
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention
+    self_attention, &
+    layernorm

   interface input

@@ -222,15 +223,23 @@ module function linear2d(out_features) result(res)
       !! Resulting layer instance
   end function linear2d

-  module function self_attention(num_heads) result(res)
-    !! Rank-2 (sequence_length, out_features) self attention constructor.
-    !! sequence_length and model_dimension are determined at layer initialization, based on the
-    !! output shape of the previous layer.
-    integer, intent(in) :: num_heads
-      !! Number of attention heads
-    type(layer) :: res
-      !! Resulting layer instance
-  end function self_attention
+  module function self_attention(num_heads) result(res)
+    !! Rank-2 (sequence_length, out_features) self attention constructor.
+    !! sequence_length and model_dimension are determined at layer initialization, based on the
+    !! output shape of the previous layer.
+    integer, intent(in) :: num_heads
+      !! Number of attention heads
+    type(layer) :: res
+      !! Resulting layer instance
+  end function self_attention
+
+  module function layernorm() result(res)
+    !! Layer Normalization
+    !! (x - mean(x)) / sqrt(variance(x) + eps) * gamma + beta
+    !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
+    !! https://arxiv.org/abs/1607.06450v1
+    type(layer) :: res
+  end function layernorm

  end interface
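For reference, the docstring's formula written out: for a rank-2 input x of shape (sequence_length, model_dimension), and assuming (as is conventional, and consistent with the per-feature gamma and beta) that the statistics are taken over the model dimension for each sequence position s, the layer computes

\[
  y_{s,d} = \gamma_d \, \frac{x_{s,d} - \mu_s}{\sqrt{\sigma_s^2 + \varepsilon}} + \beta_d,
  \qquad
  \mu_s = \frac{1}{D} \sum_{d=1}^{D} x_{s,d},
  \qquad
  \sigma_s^2 = \frac{1}{D} \sum_{d=1}^{D} \left( x_{s,d} - \mu_s \right)^2,
\]

where D is model_dimension, gamma and beta are the trainable per-feature scale and shift, and eps guards against division by zero.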

src/nf/nf_layer_constructors_submodule.f90 (+8)

@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_activation, only: activation_function, relu, sigmoid

   implicit none
@@ -179,4 +180,11 @@ module function self_attention(num_heads) result(res)
     allocate(res % p, source=self_attention_layer(num_heads))
   end function self_attention

+  module function layernorm() result(res)
+    type(layer) :: res
+
+    res % name = 'layernorm'
+    allocate(res % p, source=layernorm_layer())
+  end function layernorm
+
 end submodule nf_layer_constructors_submodule

src/nf/nf_layer_submodule.f90 (+49 -5)

@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_optimizers, only: optimizer_base_type

 contains
@@ -46,7 +47,7 @@ pure module subroutine backward_1d(self, previous, gradient)

       type is(flatten_layer)

-        ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d
+        ! Upstream layers permitted: input2d, input3d, conv2d, layernorm, maxpool2d
         select type(prev_layer => previous % p)
           type is(input2d_layer)
             call this_layer % backward(prev_layer % output, gradient)
@@ -60,6 +61,8 @@ pure module subroutine backward_1d(self, previous, gradient)
             call this_layer % backward(prev_layer % output, gradient)
           type is(self_attention_layer)
             call this_layer % backward(prev_layer % output, gradient)
+          type is(layernorm_layer)
+            call this_layer % backward(prev_layer % output, gradient)
         end select

     end select
@@ -84,6 +87,8 @@ pure module subroutine backward_2d(self, previous, gradient)
             call this_layer % backward(prev_layer % output, gradient)
           type is(self_attention_layer)
             call this_layer % backward(prev_layer % output, gradient)
+          type is(layernorm_layer)
+            call this_layer % backward(prev_layer % output, gradient)
         end select

       type is(self_attention_layer)
@@ -95,8 +100,18 @@ pure module subroutine backward_2d(self, previous, gradient)
             call this_layer % backward(prev_layer % output, gradient)
           type is(self_attention_layer)
             call this_layer % backward(prev_layer % output, gradient)
+          type is(layernorm_layer)
+            call this_layer % backward(prev_layer % output, gradient)
         end select

+      type is(layernorm_layer)
+
+        select type(prev_layer => previous % p)
+          type is(linear2d_layer)
+            call this_layer % backward(prev_layer % output, gradient)
+          type is(self_attention_layer)
+            call this_layer % backward(prev_layer % output, gradient)
+        end select
     end select

   end subroutine backward_2d
@@ -234,6 +249,8 @@ module subroutine forward(self, input)
             call this_layer % forward(prev_layer % output)
           type is(linear2d_layer)
             call this_layer % forward(prev_layer % output)
+          type is(layernorm_layer)
+            call this_layer % forward(prev_layer % output)
         end select

       type is(reshape3d_layer)
@@ -250,26 +267,40 @@ module subroutine forward(self, input)

       type is(linear2d_layer)

-        ! Upstream layers permitted: input2d, linear2d
+        ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm
         select type(prev_layer => input % p)
           type is(input2d_layer)
             call this_layer % forward(prev_layer % output)
           type is(linear2d_layer)
             call this_layer % forward(prev_layer % output)
           type is(self_attention_layer)
             call this_layer % forward(prev_layer % output)
+          type is(layernorm_layer)
+            call this_layer % forward(prev_layer % output)
         end select

       type is(self_attention_layer)

-        ! Upstream layers permitted: input2d, linear2d
+        ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm
         select type(prev_layer => input % p)
          type is(input2d_layer)
            call this_layer % forward(prev_layer % output)
          type is(linear2d_layer)
            call this_layer % forward(prev_layer % output)
          type is(self_attention_layer)
            call this_layer % forward(prev_layer % output)
+         type is(layernorm_layer)
+           call this_layer % forward(prev_layer % output)
+        end select
+
+      type is(layernorm_layer)
+
+        ! Upstream layers permitted: linear2d, self_attention
+        select type(prev_layer => input % p)
+          type is(linear2d_layer)
+            call this_layer % forward(prev_layer % output)
+          type is(self_attention_layer)
+            call this_layer % forward(prev_layer % output)
         end select

     end select
@@ -311,6 +342,8 @@ pure module subroutine get_output_2d(self, output)
         allocate(output, source=this_layer % output)
       type is(self_attention_layer)
         allocate(output, source=this_layer % output)
+      type is(layernorm_layer)
+        allocate(output, source=this_layer % output)
       class default
         error stop '2-d output can only be read from an input2d or linear2d layer.'

@@ -354,8 +387,8 @@ impure elemental module subroutine init(self, input)
         call this_layer % init(input % layer_shape)
     end select

-    ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, or
-    ! self_attention layers is not known until we receive an input layer.
+    ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d,
+    ! self_attention or layernorm layers is not known until we receive an input layer.
     select type(this_layer => self % p)
       type is(conv2d_layer)
         self % layer_shape = shape(this_layer % output)
@@ -367,6 +400,8 @@ impure elemental module subroutine init(self, input)
         self % layer_shape = shape(this_layer % output)
       type is(self_attention_layer)
         self % layer_shape = shape(this_layer % output)
+      type is(layernorm_layer)
+        self % layer_shape = shape(this_layer % output)
       type is(maxpool2d_layer)
         self % layer_shape = shape(this_layer % output)
     end select
@@ -425,6 +460,8 @@ elemental module function get_num_params(self) result(num_params)
         num_params = this_layer % get_num_params()
       type is (self_attention_layer)
         num_params = this_layer % get_num_params()
+      type is (layernorm_layer)
+        num_params = this_layer % get_num_params()
       class default
         error stop 'Unknown layer type.'
     end select
@@ -458,6 +495,8 @@ module function get_params(self) result(params)
         params = this_layer % get_params()
       type is (self_attention_layer)
         params = this_layer % get_params()
+      type is (layernorm_layer)
+        params = this_layer % get_params()
       class default
         error stop 'Unknown layer type.'
     end select
@@ -491,6 +530,8 @@ module function get_gradients(self) result(gradients)
         gradients = this_layer % get_gradients()
       type is (self_attention_layer)
         gradients = this_layer % get_gradients()
+      type is (layernorm_layer)
+        gradients = this_layer % get_gradients()
       class default
         error stop 'Unknown layer type.'
     end select
@@ -549,6 +590,9 @@ module subroutine set_params(self, params)
       type is (self_attention_layer)
         call this_layer % set_params(params)

+      type is (layernorm_layer)
+        call this_layer % set_params(params)
+
       type is (maxpool2d_layer)
         ! No parameters to set.
         write(stderr, '(a)') 'Warning: calling set_params() ' &
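The dispatch additions above wire layernorm into get_num_params, get_params, get_gradients, and set_params, so its gamma and beta take part in optimization like any other trainable state. A hedged, self-contained check (shapes are illustrative; it assumes the network-level get_num_params sums the per-layer counts, as it does for the existing layers):

program layernorm_param_count
  use nf, only: network, input, linear2d, layernorm
  implicit none

  type(network) :: net

  ! Illustrative shapes: sequence length 3, model dimension 8.
  net = network([input(3, 8), linear2d(8), layernorm()])

  ! layernorm holds one gamma and one beta per feature, so it should add
  ! 2 * 8 = 16 parameters on top of linear2d's weights and biases.
  print '(a, i0)', 'total trainable parameters: ', net % get_num_params()
end program layernorm_param_count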

src/nf/nf_layernorm.f90 (new file, +92)

@@ -0,0 +1,92 @@
+module nf_layernorm_layer
+  use nf_activation, only: activation_function
+  use nf_base_layer, only: base_layer
+
+  implicit none
+
+  private
+  public :: layernorm_layer
+
+  type, extends(base_layer) :: layernorm_layer
+    !! Layer Normalization
+    !! (x - mean(x)) / sqrt(variance(x) + eps) * gamma + beta
+    !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
+    !! https://arxiv.org/abs/1607.06450v1
+    integer :: sequence_length
+    integer :: model_dimension
+
+    real :: eps
+    real, allocatable :: gamma(:)
+    real, allocatable :: beta(:)
+
+    real, allocatable :: d_gamma(:)
+    real, allocatable :: d_beta(:)
+    real, allocatable :: gradient(:, :)
+
+    real, allocatable :: mu(:, :)
+    real, allocatable :: sigma(:)
+
+    real, allocatable :: output(:, :)
+
+    ! temp storages
+    real, allocatable, private :: normalized(:, :)
+    real, allocatable, private :: one_over_sigma(:, :)
+    real, allocatable, private :: gradient_by_gamma_over_sigma(:, :)
+  contains
+    procedure :: forward
+    procedure :: backward
+    procedure :: init
+    procedure :: get_num_params
+    procedure :: get_params
+    procedure :: get_gradients
+    procedure :: set_params
+  end type layernorm_layer
+
+  interface layernorm_layer
+    module function layernorm_layer_cons() &
+        result(res)
+      type(layernorm_layer) :: res
+    end function layernorm_layer_cons
+  end interface layernorm_layer
+
+  interface
+    pure module subroutine forward(self, input)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in) :: input(:, :)
+    end subroutine forward
+
+    pure module subroutine backward(self, input, gradient)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in) :: input(:, :)
+      real, intent(in) :: gradient(:, :)
+    end subroutine backward
+
+    module subroutine init(self, input_shape)
+      class(layernorm_layer), intent(in out) :: self
+      integer, intent(in) :: input_shape(:)
+    end subroutine init
+
+    pure module function get_num_params(self) result(num_params)
+      class(layernorm_layer), intent(in) :: self
+      integer :: num_params
+    end function get_num_params
+
+
+    module function get_params(self) result(params)
+      class(layernorm_layer), intent(in), target :: self
+      real, allocatable :: params(:)
+    end function get_params
+
+
+    module function get_gradients(self) result(gradients)
+      class(layernorm_layer), intent(in), target :: self
+      real, allocatable :: gradients(:)
+    end function get_gradients
+
+
+    module subroutine set_params(self, params)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in), target :: params(:)
+    end subroutine set_params
+  end interface
+end module nf_layernorm_layer
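The companion implementation file, src/nf/nf_layernorm_submodule.f90, is also added by this commit but is not reproduced in this excerpt. Below is a minimal sketch of a forward pass consistent with the fields declared above; it illustrates the formula only and is not the commit's actual submodule code:

! Sketch only: names and details below are illustrative assumptions, not the
! real nf_layernorm_submodule.f90 added by this commit.
submodule(nf_layernorm_layer) nf_layernorm_submodule_sketch
  implicit none
contains

  pure module subroutine forward(self, input)
    class(layernorm_layer), intent(in out) :: self
    real, intent(in) :: input(:, :)
    integer :: i

    ! output is assumed to have been allocated in init() to
    ! shape (sequence_length, model_dimension).

    ! Per-position mean over the model dimension, broadcast back to
    ! shape (sequence_length, model_dimension) to match the stored mu(:, :).
    self % mu = spread( &
      sum(input, dim=2) / self % model_dimension, &
      dim=2, ncopies=self % model_dimension &
    )

    ! Per-position standard deviation, with eps added to the variance
    ! before taking the square root.
    self % sigma = sqrt( &
      sum((input - self % mu)**2, dim=2) / self % model_dimension + self % eps &
    )

    ! Normalize, then apply the per-feature scale (gamma) and shift (beta).
    do concurrent (i = 1:self % model_dimension)
      self % output(:, i) = &
        (input(:, i) - self % mu(:, i)) / self % sigma * self % gamma(i) + self % beta(i)
    end do
  end subroutine forward

end submodule nf_layernorm_submodule_sketch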
