diff --git a/CMakeLists.txt b/CMakeLists.txt
index 906bdd0d..beb3389d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,8 @@ add_library(neural-fortran
   src/nf/nf_layer_submodule.f90
   src/nf/nf_linear2d_layer.f90
   src/nf/nf_linear2d_layer_submodule.f90
+  src/nf/nf_embedding_layer.f90
+  src/nf/nf_embedding_layer_submodule.f90
   src/nf/nf_loss.f90
   src/nf/nf_loss_submodule.f90
   src/nf/nf_maxpool2d_layer.f90
diff --git a/README.md b/README.md
index 9fe3fab0..e94296a3 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 | Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass |
 |------------|------------------|------------------------|----------------------|--------------|---------------|
 | Input | `input` | n/a | 1, 2, 3 | n/a | n/a |
+| Embedding | `embedding` | n/a | 2 | ✅ | ✅ |
 | Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ |
 | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
 | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
diff --git a/src/nf.f90 b/src/nf.f90
index d089c4ac..181183e7 100644
--- a/src/nf.f90
+++ b/src/nf.f90
@@ -6,13 +6,14 @@ module nf
     conv2d, &
     dense, &
     dropout, &
+    embedding, &
     flatten, &
     input, &
+    layernorm, &
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention, &
-    layernorm
+    self_attention
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network
diff --git a/src/nf/nf_embedding_layer.f90 b/src/nf/nf_embedding_layer.f90
new file mode 100644
index 00000000..94a868a5
--- /dev/null
+++ b/src/nf/nf_embedding_layer.f90
@@ -0,0 +1,106 @@
+module nf_embedding_layer
+
+  use nf_activation, only: activation_function
+  use nf_base_layer, only: base_layer
+
+  implicit none
+
+  private
+  public :: embedding_layer
+
+  type, extends(base_layer) :: embedding_layer
+    !! Embedding Layer
+    !!
+    !! A trainable lookup table. Inputs are integer indices into a
+    !! dictionary of size `vocab_size`; the layer maps them to an
+    !! output array of shape (`sequence_length`, `model_dimension`).
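+    !!
+    !! A minimal usage sketch (illustrative values; `token_indices` is a
+    !! hypothetical integer array of length `sequence_length`):
+    !!
+    !!   type(embedding_layer) :: emb
+    !!   emb = embedding_layer(vocab_size=10000, model_dimension=512)
+    !!   call emb % init([128])
+    !!   call emb % forward(token_indices)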
+    integer :: sequence_length, vocab_size, model_dimension
+    integer :: positional  ! positional encoding type: 0 = none, 1 = trigonometric, 2 = absolute
+
+    real, allocatable :: weights(:, :)
+    real, allocatable :: output(:, :)
+    real, allocatable :: dw(:, :) ! weight gradients
+
+  contains
+
+    procedure :: backward
+    procedure :: forward
+    procedure :: positional_trigonometric
+    procedure :: positional_absolute
+    procedure :: init
+    procedure :: get_num_params
+    procedure :: get_params
+    procedure :: get_gradients
+    procedure :: set_params
+
+  end type embedding_layer
+
+  interface embedding_layer
+    module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res)
+      integer, intent(in) :: vocab_size, model_dimension
+      integer, optional :: positional
+      type(embedding_layer) :: res
+    end function embedding_layer_cons
+  end interface embedding_layer
+
+  interface
+    pure module subroutine forward(self, input)
+      !! Look up embedding vectors in the dictionary by the integer indices in `input`
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: input(:)
+    end subroutine forward
+
+    pure module subroutine backward(self, input, gradient)
+      !! Accumulate the weight gradients at the `input` indices:
+      !! dw_i = dw_i + gradient_i
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: input(:)
+      real, intent(in) :: gradient(:, :)
+    end subroutine backward
+
+    pure module subroutine positional_trigonometric(self, pos)
+      !! Add trigonometric (sinusoidal) positional encoding to the embedding (not trainable)
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: pos
+    end subroutine positional_trigonometric
+
+    pure module subroutine positional_absolute(self, pos)
+      !! Add the absolute (integer) position to the embedding
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: pos
+    end subroutine positional_absolute
+
+    module subroutine init(self, input_shape)
+      class(embedding_layer), intent(in out) :: self
+      integer, intent(in) :: input_shape(:)
+    end subroutine init
+
+    pure module function get_num_params(self) result(num_params)
+       class(embedding_layer), intent(in) :: self
+       integer :: num_params
+    end function get_num_params
+
+    module function get_params(self) result(params)
+      class(embedding_layer), intent(in), target :: self
+      real, allocatable :: params(:)
+    end function get_params
+
+    module function get_gradients(self) result(gradients)
+      class(embedding_layer), intent(in), target :: self
+      real, allocatable :: gradients(:)
+    end function get_gradients
+
+    module subroutine set_params(self, params)
+      class(embedding_layer), intent(in out) :: self
+      real, intent(in), target :: params(:)
+    end subroutine set_params
+  end interface
+end module nf_embedding_layer
diff --git a/src/nf/nf_embedding_layer_submodule.f90 b/src/nf/nf_embedding_layer_submodule.f90
new file mode 100644
index 00000000..83992b22
--- /dev/null
+++ b/src/nf/nf_embedding_layer_submodule.f90
@@ -0,0 +1,143 @@
+#define NONE 0
+#define TRIGONOMETRIC 1
+#define ABSOLUTE 2
+
+submodule(nf_embedding_layer) nf_embedding_layer_submodule
+  use nf_base_layer, only: base_layer
+  implicit none
+contains
+  module function embedding_layer_cons(vocab_size, model_dimension, positional) result(res)
+    integer, intent(in) :: vocab_size, model_dimension
+    integer, optional :: positional
+    type(embedding_layer) :: res
+
+    res % vocab_size = vocab_size
+    res % model_dimension = model_dimension
+    if (.not. present(positional)) then
+      res % positional = NONE
+    else
+      res % positional = positional
+    end if
+  end function embedding_layer_cons
+
+  module subroutine init(self, input_shape)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    self % sequence_length = input_shape(1)
+
+    allocate(self % output(self % sequence_length, self % model_dimension))
+
+    allocate(self % weights(self % vocab_size, self % model_dimension))
+    self % weights = 0.1
+
+    allocate(self % dw(self % vocab_size, self % model_dimension))
+    self % dw = 0.0
+  end subroutine init
+
+  pure module subroutine forward(self, input)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: input(:)
+    integer :: i, index
+
+    do concurrent(i = 1: self % sequence_length)
+      index = input(i)
+      if (index > size(self % weights, 1)) then
+        index = 1
+      elseif (index < 1) then
+        index = 1
+      end if
+
+      self % output(i, :) = self % weights(index, :)
+
+      if (self % positional == TRIGONOMETRIC) then
+        call self % positional_trigonometric(i)
+      elseif (self % positional == ABSOLUTE) then
+        call self % positional_absolute(i)
+      end if
+    end do
+  end subroutine forward
+
+  pure module subroutine backward(self, input, gradient)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: input(:)
+    real, intent(in) :: gradient(:, :)
+    integer :: i
+
+    do concurrent(i = 1: self % sequence_length)
+      self % dw(input(i), :) = self % dw(input(i), :) + gradient(i, :)
+    end do
+  end subroutine backward
+
+  pure module subroutine positional_trigonometric(self, pos)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: pos
+    integer :: i
+    real :: theta
+
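+    ! Sinusoidal positional encoding: for each dimension pair i,
+    ! theta = (pos - 1) / 10000**(2*(i - 1) / model_dimension);
+    ! sin(theta) is added to output component 2*i - 1 and cos(theta)
+    ! to component 2*i at position pos.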
+    do concurrent(i = 1: floor(real(self % model_dimension) / 2))
+      theta = (pos - 1) / 10000 ** (real(2 * (i-1)) / self % model_dimension)
+      self % output(pos, 2 * i - 1) = self % output(pos, 2 * i - 1) + sin(theta)
+      self % output(pos, 2 * i) = self % output(pos, 2 * i) + cos(theta)
+    end do
+  end subroutine positional_trigonometric
+
+  pure module subroutine positional_absolute(self, pos)
+    class(embedding_layer), intent(in out) :: self
+    integer, intent(in) :: pos
+    integer :: i
+
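+    ! Absolute positional encoding: add the zero-based position
+    ! index (pos - 1) to every component at position pos.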
+    do concurrent(i = 1: self % model_dimension)
+      self % output(pos, i) = self % output(pos, i) + pos - 1
+    end do
+  end subroutine positional_absolute
+
+  pure module function get_num_params(self) result(num_params)
+    class(embedding_layer), intent(in) :: self
+    integer :: num_params
+    num_params = self % vocab_size * self % model_dimension
+  end function get_num_params
+
+  module function get_params(self) result(params)
+    class(embedding_layer), intent(in), target :: self
+    real, allocatable :: params(:)
+    real, pointer :: w_(:) => null()
+
+    w_(1: product(shape(self % weights))) => self % weights
+    params = w_
+  end function get_params
+
+  module function get_gradients(self) result(gradients)
+    class(embedding_layer), intent(in), target :: self
+    real, allocatable :: gradients(:)
+    real, pointer :: dw_(:) => null()
+
+    dw_(1: product(shape(self % dw))) => self % dw
+    gradients = dw_
+  end function get_gradients
+
+  module subroutine set_params(self, params)
+    class(embedding_layer), intent(in out) :: self
+    real, intent(in), target :: params(:)
+
+    real, pointer :: p_(:,:) => null()
+
+    ! check if the number of parameters is correct
+    if (size(params) /= self % get_num_params()) then
+      error stop 'Error: number of parameters does not match'
+    end if
+
+    associate(n => self % vocab_size * self % model_dimension)
+      ! reshape the weights
+      p_(1:self % vocab_size, 1:self % model_dimension) => params(1 : n)
+      self % weights = p_
+    end associate
+
+  end subroutine set_params
+end submodule nf_embedding_layer_submodule
diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90
index ce00b6bc..a13eee4a 100644
--- a/src/nf/nf_layer_constructors.f90
+++ b/src/nf/nf_layer_constructors.f90
@@ -18,6 +18,7 @@ module nf_layer_constructors
     maxpool2d, &
     reshape, &
     self_attention, &
+    embedding, &
     layernorm
 
   interface input
@@ -233,6 +234,28 @@ module function self_attention(num_heads) result(res)
         !! Resulting layer instance
     end function self_attention
 
+    module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res)
+      !! Embedding layer constructor.
+      !!
+      !! This layer takes token indices from a vocabulary as input to the network
+      !! and works as a trainable lookup table that maps each index to a vector.
+      !! The embedding layer must be the first layer in the network.
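+      !!
+      !! A minimal usage sketch (illustrative values; `emb` is a `type(layer)` variable):
+      !!
+      !!   emb = embedding(sequence_length=128, vocab_size=10000, &
+      !!                   model_dimension=512, positional=1)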
+      integer, intent(in) :: sequence_length
+        !! Maximum length of the input sequence
+      integer, intent(in) :: vocab_size
+        !! Number of tokens in the vocabulary
+      integer, intent(in) :: model_dimension
+        !! Dimension of the embedding vectors
+      integer, optional, intent(in) :: positional
+        !! Type of positional encoding to apply: 0 (none, default), 1 (trigonometric), 2 (absolute)
+      type(layer) :: res
+    end function embedding
+
     module function layernorm() result(res)
       !! Layer Normalization
       !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta
diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90
index 5c2e8893..cd829f59 100644
--- a/src/nf/nf_layer_constructors_submodule.f90
+++ b/src/nf/nf_layer_constructors_submodule.f90
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_embedding_layer, only: embedding_layer
   use nf_layernorm_layer, only: layernorm_layer
   use nf_activation, only: activation_function, relu, sigmoid
 
@@ -172,6 +173,7 @@ module function linear2d(out_features) result(res)
 
   end function linear2d
 
+
   module function self_attention(num_heads) result(res)
     integer, intent(in) :: num_heads
     type(layer) :: res
@@ -180,9 +182,26 @@ module function self_attention(num_heads) result(res)
     allocate(res % p, source=self_attention_layer(num_heads))
   end function self_attention
 
-  module function layernorm() result(res)
+
+  module function embedding(sequence_length, vocab_size, model_dimension, positional) result(res)
+    integer, intent(in) :: sequence_length, vocab_size, model_dimension
+    integer, optional, intent(in) :: positional
     type(layer) :: res
+    type(embedding_layer) :: embedding_layer_instance
+
+    embedding_layer_instance = embedding_layer(vocab_size, model_dimension, positional)
+    call embedding_layer_instance % init([sequence_length])
+    res % name = 'embedding'
+    res % layer_shape = [sequence_length, model_dimension]
+    res % input_layer_shape = [integer ::]
+    allocate(res % p, source=embedding_layer_instance)
+    res % initialized = .true.
+
+  end function embedding
+
 
+  module function layernorm() result(res)
+    type(layer) :: res
     res % name = 'layernorm'
     allocate(res % p, source=layernorm_layer())
   end function layernorm
diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90
index a3b42434..afc64746 100644
--- a/src/nf/nf_layer_submodule.f90
+++ b/src/nf/nf_layer_submodule.f90
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_embedding_layer, only: embedding_layer
   use nf_layernorm_layer, only: layernorm_layer
   use nf_optimizers, only: optimizer_base_type
 
@@ -61,6 +62,8 @@ pure module subroutine backward_1d(self, previous, gradient)
             call this_layer % backward(prev_layer % output, gradient)
           type is(self_attention_layer)
             call this_layer % backward(prev_layer % output, gradient)
+          type is(embedding_layer)
+            call this_layer % backward(prev_layer % output, gradient)
           type is(layernorm_layer)
             call this_layer % backward(prev_layer % output, gradient)
         end select
@@ -83,6 +86,8 @@ pure module subroutine backward_2d(self, previous, gradient)
         select type(prev_layer => previous % p)
           type is(input2d_layer)
             call this_layer % backward(prev_layer % output, gradient)
+          type is(embedding_layer)
+            call this_layer % backward(prev_layer % output, gradient)
           type is(linear2d_layer)
             call this_layer % backward(prev_layer % output, gradient)
           type is(self_attention_layer)
@@ -96,6 +101,8 @@ pure module subroutine backward_2d(self, previous, gradient)
         select type(prev_layer => previous % p)
           type is(input2d_layer)
             call this_layer % backward(prev_layer % output, gradient)
+          type is(embedding_layer)
+            call this_layer % backward(prev_layer % output, gradient)
           type is(linear2d_layer)
             call this_layer % backward(prev_layer % output, gradient)
           type is(self_attention_layer)
@@ -271,6 +278,8 @@ module subroutine forward(self, input)
         select type(prev_layer => input % p)
           type is(input2d_layer)
             call this_layer % forward(prev_layer % output)
+          type is(embedding_layer)
+            call this_layer % forward(prev_layer % output)
           type is(linear2d_layer)
             call this_layer % forward(prev_layer % output)
           type is(self_attention_layer)
@@ -285,6 +294,8 @@ module subroutine forward(self, input)
         select type(prev_layer => input % p)
           type is(input2d_layer)
             call this_layer % forward(prev_layer % output)
+          type is(embedding_layer)
+            call this_layer % forward(prev_layer % output)
           type is(linear2d_layer)
             call this_layer % forward(prev_layer % output)
           type is(self_attention_layer)
@@ -338,6 +349,8 @@ pure module subroutine get_output_2d(self, output)
 
       type is(input2d_layer)
         allocate(output, source=this_layer % output)
+      type is(embedding_layer)
+        allocate(output, source=this_layer % output)
       type is(linear2d_layer)
         allocate(output, source=this_layer % output)
       type is(self_attention_layer)
@@ -460,6 +473,8 @@ elemental module function get_num_params(self) result(num_params)
         num_params = this_layer % get_num_params()
       type is (self_attention_layer)
         num_params = this_layer % get_num_params()
+      type is (embedding_layer)
+        num_params = this_layer % get_num_params()
       type is (layernorm_layer)
         num_params = this_layer % get_num_params()
       class default
@@ -495,6 +510,8 @@ module function get_params(self) result(params)
         params = this_layer % get_params()
       type is (self_attention_layer)
         params = this_layer % get_params()
+      type is (embedding_layer)
+        params = this_layer % get_params()
       type is (layernorm_layer)
         params = this_layer % get_params()
       class default
@@ -530,6 +547,8 @@ module function get_gradients(self) result(gradients)
         gradients = this_layer % get_gradients()
       type is (self_attention_layer)
         gradients = this_layer % get_gradients()
+      type is (embedding_layer)
+        gradients = this_layer % get_gradients()
       type is (layernorm_layer)
         gradients = this_layer % get_gradients()
       class default
@@ -589,6 +608,8 @@ module subroutine set_params(self, params)
 
       type is (self_attention_layer)
         call this_layer % set_params(params)
+      type is (embedding_layer)
+        call this_layer % set_params(params)
 
       type is (layernorm_layer)
         call this_layer % set_params(params)
diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90
index 5916924e..53d3c07d 100644
--- a/src/nf/nf_network.f90
+++ b/src/nf/nf_network.f90
@@ -32,17 +32,19 @@ module nf_network
 
     procedure, private :: evaluate_batch_1d
     procedure, private :: forward_1d
+    procedure, private :: forward_1d_int
     procedure, private :: forward_2d
     procedure, private :: forward_3d
     procedure, private :: predict_1d
+    procedure, private :: predict_1d_int
     procedure, private :: predict_2d
     procedure, private :: predict_3d
     procedure, private :: predict_batch_1d
     procedure, private :: predict_batch_3d
 
     generic :: evaluate => evaluate_batch_1d
-    generic :: forward => forward_1d, forward_2d, forward_3d
-    generic :: predict => predict_1d, predict_2d, predict_3d
+    generic :: forward => forward_1d, forward_1d_int, forward_2d, forward_3d
+    generic :: predict => predict_1d, predict_1d_int, predict_2d, predict_3d
     generic :: predict_batch => predict_batch_1d, predict_batch_3d
 
   end type network
@@ -95,6 +97,13 @@ module subroutine forward_1d(self, input)
         !! 1-d input data
     end subroutine forward_1d
 
+    module subroutine forward_1d_int(self, input)
+      !! Same as `forward_1d`, but takes an `integer` input array (token indices)
+      class(network), intent(in out) :: self
+      integer, intent(in) :: input(:)
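+        !! 1-d integer input data (token indices)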
+    end subroutine forward_1d_int
+
     module subroutine forward_2d(self, input)
       !! Apply a forward pass through the network.
       !!
@@ -137,6 +146,14 @@ module function predict_1d(self, input) result(res)
         !! Output of the network
     end function predict_1d
 
+    module function predict_1d_int(self, input) result(res)
+      !! Same as `predict_1d`, but takes an `integer` input array (token indices)
+      class(network), intent(in out) :: self
+      integer, intent(in) :: input(:)
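+        !! 1-d integer input data (token indices)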
+      real, allocatable :: res(:)
+    end function predict_1d_int
+
     module function predict_2d(self, input) result(res)
       !! Return the output of the network given the input 1-d array.
       class(network), intent(in out) :: self
diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90
index a6b7657c..efd98327 100644
--- a/src/nf/nf_network_submodule.f90
+++ b/src/nf/nf_network_submodule.f90
@@ -11,6 +11,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_embedding_layer, only: embedding_layer
   use nf_layernorm_layer, only: layernorm_layer
   use nf_layer, only: layer
   use nf_layer_constructors, only: conv2d, dense, flatten, input, maxpool2d, reshape
@@ -47,7 +48,7 @@ module function network_from_layers(layers) result(res)
       error stop 'Error: A network must have at least 2 layers.'
 
     ! The first layer must be an input layer
-    if (.not. layers(1) % name == 'input') &
+    if (.not. layers(1) % name == 'input' .and. .not. layers(1) % name == 'embedding') &
       error stop 'Error: First layer in the network must be an input layer.'
 
     !TODO Ensure that the layers are in allowed sequence:
@@ -210,8 +211,9 @@ module subroutine forward_1d(self, input)
     integer :: n
 
     ! Set the input array into the input layer
-    select type(input_layer => self % layers(1) % p); type is(input1d_layer)
-      call input_layer % set(input)
+    select type(input_layer => self % layers(1) % p)
+      type is(input1d_layer)
+        call input_layer % set(input)
     end select
 
     do n = 2, size(self % layers)
@@ -220,6 +222,22 @@ module subroutine forward_1d(self, input)
 
   end subroutine forward_1d
 
+  module subroutine forward_1d_int(self, input)
+    class(network), intent(in out) :: self
+    integer, intent(in) :: input(:)
+    integer :: n
+
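+    ! Set the token indices into the first (embedding) layer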
+    select type(input_layer => self % layers(1) % p)
+      type is(embedding_layer)
+        call input_layer % forward(input)
+    end select
+
+    do n = 2, size(self % layers)
+      call self % layers(n) % forward(self % layers(n - 1))
+    end do
+
+  end subroutine forward_1d_int
 
   module subroutine forward_2d(self, input)
     class(network), intent(in out) :: self
@@ -284,6 +302,31 @@ module function predict_1d(self, input) result(res)
 
   end function predict_1d
 
+  module function predict_1d_int(self, input) result(res)
+    class(network), intent(in out) :: self
+    integer, intent(in) :: input(:)
+    real, allocatable :: res(:)
+    integer :: num_layers
+
+    num_layers = size(self % layers)
+
+    call self % set_training_mode(.false.)
+    call self % forward(input)
+    call self % set_training_mode(.true.)
+
+    select type(output_layer => self % layers(num_layers) % p)
+      type is(dense_layer)
+        res = output_layer % output
+      type is(dropout_layer)
+        res = output_layer % output
+      type is(flatten_layer)
+        res = output_layer % output
+      class default
+        error stop 'network % output not implemented for ' // &
+          trim(self % layers(num_layers) % name) // ' layer'
+    end select
+
+  end function predict_1d_int
 
   module function predict_2d(self, input) result(res)
     class(network), intent(in out) :: self
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 46d349c1..6982ddd4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -12,6 +12,7 @@ foreach(execid
   insert_flatten
   reshape_layer
   multihead_attention_layer
+  embedding_layer
   layernorm
   dense_network
   get_set_network_params
diff --git a/test/test_embedding_layer.f90 b/test/test_embedding_layer.f90
new file mode 100644
index 00000000..99b7fca6
--- /dev/null
+++ b/test/test_embedding_layer.f90
@@ -0,0 +1,134 @@
+program test_embedding_layer
+  use iso_fortran_env, only: stderr => error_unit
+  use nf_embedding_layer, only: embedding_layer
+  use nf_layer, only: layer
+  use nf_layer_constructors, only: embedding_constructor => embedding
+  implicit none
+
+  logical :: ok = .true.
+  integer :: sample_input(3) = [2, 1, 3]
+
+  call test_simple(ok, sample_input)
+  call test_positional_trigonometric(ok, sample_input)
+  call test_positional_absolute(ok, sample_input)
+  call test_embedding_constructor(ok, sample_input)
+
+  if (ok) then
+    print '(a)', 'test_embedding_layer: All tests passed.'
+  else
+    write(stderr, '(a)') 'test_embedding_layer: One or more tests failed.'
+    error stop 1
+  end if
+
+contains
+  subroutine test_simple(ok, sample_input)
+    logical, intent(in out) :: ok
+    integer, intent(in) :: sample_input(:)
+
+    real :: sample_gradient(3, 2) = reshape([0.1, 0.2, 0.3, 0.4, 0.6, 0.6], [3, 2])
+    real :: output_flat(6)
+    real :: expected_output_flat(6) = reshape([0.3, 0.1, 0.5, 0.4, 0.2, 0.6], [6])
+    real :: dw_flat(8)
+    real :: expected_dw_flat(8) = reshape([0.2, 0.1, 0.3, 0., 0.6, 0.4, 0.6, 0.], [8])
+    type(embedding_layer) :: embedding
+
+    embedding = embedding_layer(vocab_size=4, model_dimension=2)
+    call embedding % init([3])
+    embedding % weights = reshape([0.1, 0.3, 0.5, 0.7, 0.2, 0.4, 0.6, 0.8], [4, 2])
+
+    call embedding % forward(sample_input)
+
+    output_flat = reshape(embedding % output, [6])
+    if (.not. all(output_flat.eq.expected_output_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'forward returned incorrect values.. failed'
+    end if
+
+    call embedding % backward(sample_input, sample_gradient)
+    dw_flat = reshape(embedding % dw, shape(dw_flat))
+    if (.not. all(dw_flat.eq.expected_dw_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect dw values.. failed'
+    end if
+  end subroutine test_simple
+
+  subroutine test_positional_trigonometric(ok, sample_input)
+    logical, intent(in out) :: ok
+    integer, intent(in) :: sample_input(:)
+
+    real :: output_flat(12)
+    real :: expected_output_flat(12) = reshape([&
+        0.3, 0.941471, 1.4092975,&
+        1.3, 0.64030236, 0.08385316,&
+        0.3, 0.10999984, 0.51999867,&
+        1.3, 1.09995, 1.4998&
+    ], [12])
+    type(embedding_layer) :: embedding
+
+    real :: theta
+    integer :: i, pos
+
+    embedding = embedding_layer(vocab_size=5, model_dimension=4, positional=1)
+    call embedding % init([3])
+    embedding % weights = reshape([&
+        0.1, 0.3, 0.5, 0.7, 0.2,&
+        0.1, 0.3, 0.5, 0.7, 0.2,&
+        0.1, 0.3, 0.5, 0.7, 0.2,&
+        0.1, 0.3, 0.5, 0.7, 0.2&
+    ], [5, 4])
+
+    call embedding % forward(sample_input)
+
+    output_flat = reshape(embedding % output, [12])
+    if (.not. all(abs(output_flat - expected_output_flat) <= (1e-06 + 1e-05 * abs(expected_output_flat)))) then
+      ok = .false.
+      write(stderr, '(a)') 'trigonometric positional encoding returned incorrect values.. failed'
+    end if
+  end subroutine test_positional_trigonometric
+
+  subroutine test_positional_absolute(ok, sample_input)
+    logical, intent(in out) :: ok
+    integer, intent(in) :: sample_input(:)
+
+    real :: output_flat(12)
+    real :: expected_output_flat(12) = reshape([&
+        0.3, 1.1, 2.5,&
+        0.3, 1.1, 2.5,&
+        0.3, 1.1, 2.5,&
+        0.3, 1.1, 2.5&
+    ], [12])
+    type(embedding_layer) :: embedding
+
+    real :: theta
+    integer :: i, pos
+
+    embedding = embedding_layer(vocab_size=5, model_dimension=4, positional=2)
+    call embedding % init([3])
+    embedding % weights = reshape([&
+        0.1, 0.3, 0.5, 0.7, 0.2,&
+        0.1, 0.3, 0.5, 0.7, 0.2,&
+        0.1, 0.3, 0.5, 0.7, 0.2,&
+        0.1, 0.3, 0.5, 0.7, 0.2&
+    ], [5, 4])
+
+    call embedding % forward(sample_input)
+
+    output_flat = reshape(embedding % output, [12])
+    if (.not. all(abs(output_flat - expected_output_flat) <= (1e-06 + 1e-05 * abs(expected_output_flat)))) then
+      ok = .false.
+      write(stderr, '(a)') 'absolute positional encoding returned incorrect values.. failed'
+    end if
+  end subroutine test_positional_absolute
+
+  subroutine test_embedding_constructor(ok, sample_input)
+    logical, intent(in out) :: ok
+    integer, intent(in) :: sample_input(:)
+
+    type(layer) :: embedding_constructed
+
+    embedding_constructed = embedding_constructor(sequence_length=3, vocab_size=5, model_dimension=4)
+    embedding_constructed = embedding_constructor(sequence_length=3, vocab_size=5, model_dimension=4, positional=0)
+    embedding_constructed = embedding_constructor(sequence_length=3, vocab_size=5, model_dimension=4, positional=1)
+    embedding_constructed = embedding_constructor(sequence_length=3, vocab_size=5, model_dimension=4, positional=2)
+  end subroutine test_embedding_constructor
+end program test_embedding_layer