Embedding Layer #205

Merged · 15 commits · Mar 5, 2025
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -43,6 +43,8 @@ add_library(neural-fortran
src/nf/nf_layer_submodule.f90
src/nf/nf_linear2d_layer.f90
src/nf/nf_linear2d_layer_submodule.f90
src/nf/nf_embedding_layer.f90
src/nf/nf_embedding_layer_submodule.f90
src/nf/nf_loss.f90
src/nf/nf_loss_submodule.f90
src/nf/nf_maxpool2d_layer.f90
1 change: 1 addition & 0 deletions README.md
@@ -30,6 +30,7 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
| Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass |
|------------|------------------|------------------------|----------------------|--------------|---------------|
| Input | `input` | n/a | 1, 2, 3 | n/a | n/a |
| Embedding | `embedding` | n/a | 2 | ✅ | ✅ |
| Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ |
| Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
5 changes: 3 additions & 2 deletions src/nf.f90
@@ -6,13 +6,14 @@ module nf
conv2d, &
dense, &
dropout, &
embedding, &
flatten, &
input, &
layernorm, &
linear2d, &
maxpool2d, &
reshape, &
self_attention, &
layernorm
self_attention
use nf_loss, only: mse, quadratic
use nf_metrics, only: corr, maxabs
use nf_network, only: network
98 changes: 98 additions & 0 deletions src/nf/nf_embedding_layer.f90
@@ -0,0 +1,98 @@
module nf_embedding_layer

use nf_activation, only: activation_function
use nf_base_layer, only: base_layer

implicit none

private
public :: embedding_layer

type, extends(base_layer) :: embedding_layer
!! Embedding layer
!! A trainable lookup table for token embeddings. Inputs are
!! integer indices into a dictionary of size `vocab_size`;
!! the layer converts them into an output table of shape
!! (`sequence_length`, `model_dimension`).
integer :: sequence_length, vocab_size, model_dimension
integer :: positional

real, allocatable :: weights(:, :)
real, allocatable :: output(:, :)
real, allocatable :: dw(:, :) ! weight gradients

contains

procedure :: backward
procedure :: forward
procedure :: positional_trigonometric
procedure :: positional_absolute
procedure :: init
procedure :: get_num_params
procedure :: get_params
procedure :: get_gradients
procedure :: set_params

end type embedding_layer

interface embedding_layer
module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res)
integer, intent(in) :: vocab_size, model_dimension
integer, optional :: positional
type(embedding_layer) :: res
end function embedding_layer_cons
end interface embedding_layer

interface
pure module subroutine forward(self, input)
!! Look up embedding vectors by their indices in the dictionary
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: input(:)
end subroutine forward

pure module subroutine backward(self, input, gradient)
!! Accumulate the weight gradients at the `input` indices:
!! dw_i = dw_i + d_output_i
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: input(:)
real, intent(in) :: gradient(:, :)
end subroutine backward

pure module subroutine positional_trigonometric(self, pos)
!! Add trigonometric (sinusoidal, non-trainable) positional information to the embedding
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: pos
end subroutine positional_trigonometric

pure module subroutine positional_absolute(self, pos)
!! Add the absolute (zero-based) position to the embedding
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: pos
end subroutine positional_absolute

module subroutine init(self, input_shape)
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: input_shape(:)
end subroutine init

pure module function get_num_params(self) result(num_params)
class(embedding_layer), intent(in) :: self
integer :: num_params
end function get_num_params

module function get_params(self) result(params)
class(embedding_layer), intent(in), target :: self
real, allocatable :: params(:)
end function get_params

module function get_gradients(self) result(gradients)
class(embedding_layer), intent(in), target :: self
real, allocatable :: gradients(:)
end function get_gradients

module subroutine set_params(self, params)
class(embedding_layer), intent(in out) :: self
real, intent(in), target :: params(:)
end subroutine set_params
end interface
end module nf_embedding_layer
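As a quick orientation, here is a minimal sketch (not part of this PR) of exercising the `embedding_layer` type directly; the vocabulary size, embedding dimension, and token indices below are made-up values.

```fortran
program embedding_layer_demo
  ! Minimal sketch, not from this PR: use embedding_layer on its own.
  use nf_embedding_layer, only: embedding_layer
  implicit none

  type(embedding_layer) :: emb
  integer :: tokens(4)

  ! 10-entry vocabulary, 8-dimensional embeddings, no positional encoding
  emb = embedding_layer(vocab_size=10, model_dimension=8)
  call emb % init([4])              ! sequence length of 4

  tokens = [3, 7, 7, 1]             ! integer indices into the vocabulary
  call emb % forward(tokens)

  print *, shape(emb % output)      ! 4 8, i.e. (sequence_length, model_dimension)
  print *, emb % get_num_params()   ! 80, i.e. vocab_size * model_dimension
end program embedding_layer_demo
```

Each row of `output` is the weight row selected by the corresponding token index, so the layer behaves exactly as the lookup table described in the type docstring.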
137 changes: 137 additions & 0 deletions src/nf/nf_embedding_layer_submodule.f90
@@ -0,0 +1,137 @@
#define NONE 0
#define TRIGONOMETRIC 1
#define ABSOLUTE 2

submodule(nf_embedding_layer) nf_embedding_layer_submodule
use nf_base_layer, only: base_layer
implicit none
contains
module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res)
integer, intent(in) :: vocab_size, model_dimension
integer, optional :: positional
type(embedding_layer) :: res

res % vocab_size = vocab_size
res % model_dimension = model_dimension
if (.not. present(positional)) then
res % positional = NONE
else
res % positional = positional
end if
end function embedding_layer_cons

module subroutine init(self, input_shape)
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: input_shape(:)

self % sequence_length = input_shape(1)

allocate(self % output(self % sequence_length, self % model_dimension))

allocate(self % weights(self % vocab_size, self % model_dimension))
self % weights = 0.1

allocate(self % dw(self % vocab_size, self % model_dimension))
self % dw = 0.0
end subroutine init

pure module subroutine forward(self, input)
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: input(:)
integer :: i, index

do concurrent(i = 1: self % sequence_length)
! clamp out-of-range and zero indices to the first vocabulary entry
index = input(i)
if (index > size(self % weights, 1) .or. index == 0) index = 1

self % output(i, :) = self % weights(index, :)

if (self % positional == TRIGONOMETRIC) then
call self % positional_trigonometric(i)
elseif (self % positional == ABSOLUTE) then
call self % positional_absolute(i)
end if
end do
end subroutine forward

pure module subroutine backward(self, input, gradient)
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: input(:)
real, intent(in) :: gradient(:, :)
integer :: i

do concurrent(i = 1: self % sequence_length)
self % dw(input(i), :) = self % dw(input(i), :) + gradient(i, :)
end do
end subroutine backward

pure module subroutine positional_trigonometric(self, pos)
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: pos
integer :: i
real :: theta

do concurrent(i = 1: floor(real(self % model_dimension) / 2))
theta = (pos - 1) / 10000 ** (real(2 * (i-1)) / self % model_dimension)
self % output(pos, 2 * i - 1) = self % output(pos, 2 * i - 1) + sin(theta)
self % output(pos, 2 * i) = self % output(pos, 2 * i) + cos(theta)
end do
end subroutine positional_trigonometric

pure module subroutine positional_absolute(self, pos)
class(embedding_layer), intent(in out) :: self
integer, intent(in) :: pos
integer :: i

do concurrent(i = 1: self % model_dimension)
self % output(pos, i) = self % output(pos, i) + pos - 1
end do
end subroutine positional_absolute

pure module function get_num_params(self) result(num_params)
class(embedding_layer), intent(in) :: self
integer :: num_params
num_params = self % vocab_size * self % model_dimension
end function get_num_params

module function get_params(self) result(params)
class(embedding_layer), intent(in), target :: self
real, allocatable :: params(:)
real, pointer :: w_(:) => null()

w_(1: product(shape(self % weights))) => self % weights
params = w_
end function get_params

module function get_gradients(self) result(gradients)
class(embedding_layer), intent(in), target :: self
real, allocatable :: gradients(:)
real, pointer :: dw_(:) => null()

dw_(1: product(shape(self % dw))) => self % dw
gradients = dw_
end function get_gradients

module subroutine set_params(self, params)
class(embedding_layer), intent(in out) :: self
real, intent(in), target :: params(:)

real, pointer :: p_(:,:) => null()

! check if the number of parameters is correct
if (size(params) /= self % get_num_params()) then
error stop 'Error: number of parameters does not match'
end if

associate(n => self % vocab_size * self % model_dimension)
! reshape the weights
p_(1:self % vocab_size, 1:self % model_dimension) => params(1 : n)
self % weights = p_
end associate

end subroutine set_params
end submodule nf_embedding_layer_submodule
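For reference, the trigonometric branch above implements the familiar sinusoidal positional encoding, written here with the one-based position `pos` and pair index `i` used in the code:

```math
\theta_{pos,\,i} = \frac{pos - 1}{10000^{\,2(i-1)/d_{\mathrm{model}}}},
\qquad i = 1, \dots, \left\lfloor d_{\mathrm{model}} / 2 \right\rfloor
```

with `sin(theta)` added to output column `2*i - 1` and `cos(theta)` added to column `2*i`. The absolute variant instead adds the zero-based position `pos - 1` to every component of the embedding vector.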
18 changes: 18 additions & 0 deletions src/nf/nf_layer_constructors.f90
@@ -18,6 +18,7 @@ module nf_layer_constructors
maxpool2d, &
reshape, &
self_attention, &
embedding, &
layernorm

interface input
@@ -233,6 +234,23 @@ module function self_attention(num_heads) result(res)
!! Resulting layer instance
end function self_attention

module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res)
!! Embedding layer constructor.
!!
!! This layer feeds token indices from a dictionary into the network,
!! acting as a trainable lookup table that converts each index into a vector.
!! An embedding layer must be the first layer of a network.
integer, intent(in) :: sequence_length
!! Maximum length of the input sequence
integer, intent(in) :: vocab_size
!! Size of the token vocabulary
integer, intent(in) :: model_dimension
!! Dimension of the embedding vectors
integer, optional, intent(in) :: positional
!! Type of positional encoding to add: 1 for trigonometric, 2 for absolute; none if omitted
type(layer) :: res
end function embedding

module function layernorm() result(res)
!! Layer Normalization
!! (x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta
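A usage sketch, not taken from this PR: it assumes the usual `network([...])` constructor and that a rank-2 layer such as `linear2d` accepts `embedding` as an input layer (the compatibility table is only partially visible in this diff); all sizes are arbitrary.

```fortran
program embedding_network_demo
  ! Hypothetical sketch: the new embedding constructor as the first layer.
  use nf, only: network, embedding, linear2d
  implicit none

  type(network) :: net

  ! 64-token sequences over a 1000-entry vocabulary, embedded into 128
  ! dimensions; positional=1 selects the trigonometric encoding and 2 the
  ! absolute encoding, per the defines in the embedding submodule.
  net = network([ &
    embedding(sequence_length=64, vocab_size=1000, model_dimension=128, positional=1), &
    linear2d(out_features=32) &
  ])
end program embedding_network_demo
```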
21 changes: 20 additions & 1 deletion src/nf/nf_layer_constructors_submodule.f90
@@ -12,6 +12,7 @@
use nf_reshape_layer, only: reshape3d_layer
use nf_linear2d_layer, only: linear2d_layer
use nf_self_attention_layer, only: self_attention_layer
use nf_embedding_layer, only: embedding_layer
use nf_layernorm_layer, only: layernorm_layer
use nf_activation, only: activation_function, relu, sigmoid

@@ -172,6 +173,7 @@ module function linear2d(out_features) result(res)

end function linear2d


module function self_attention(num_heads) result(res)
integer, intent(in) :: num_heads
type(layer) :: res
@@ -180,9 +182,26 @@ module function self_attention(num_heads) result(res)
allocate(res % p, source=self_attention_layer(num_heads))
end function self_attention

module function layernorm() result(res)

module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res)
integer, intent(in) :: sequence_length, vocab_size, model_dimension
integer, optional, intent(in) :: positional
type(layer) :: res
type(embedding_layer) :: embedding_layer_instance

embedding_layer_instance = embedding_layer(vocab_size, model_dimension, positional)
call embedding_layer_instance % init([sequence_length])
res % name = 'embedding'
res % layer_shape = [sequence_length, model_dimension]
res % input_layer_shape = [integer ::]
allocate(res % p, source=embedding_layer_instance)
res % initialized = .true.

end function embedding


module function layernorm() result(res)
type(layer) :: res
res % name = 'layernorm'
allocate(res % p, source=layernorm_layer())
end function layernorm