
Commit ed8b340

OneAdder, milancurcic, and jvdp1 authored
Multihead attention (#199)

* linear2d_layer forward implementation
* linear2d_layer: temporarily remove api
* Don't expose the concrete layer type via nf
* Plumbing of linear2d with input2d and linear2d
* linear2d_layer: add flatten2d layer
* linear2d_layer: make linear2d layer work with input2d and flatten2d
* update cmake
* linear2d_layer: remove flatten2d layer
* linear2d_layer: remove public api
* linear2d_layer: update cmakelists
* Add linear2d example
* linear2d_layer: remove redundant constructor args
* linear2d_layer: make example converge
* linear2d_layer: add loss stopping and more iterations
* start implementing MultiHeadAttention
* scaled dot product attention
* combine attention heads
* forward (not working)
* rearrange attention dimensions in more efficient way
* initial forward implementation for multi-head attention
* tests for multihead_attention%forward
* multihead_attention: move most logic to subroutines (performance)
* multihead_attention: update tests
* multihead_attention: concurrency
* multihead_attention: proof of concept backward (works, but not mathematically correct)
* multihead_attention: fix minor scaling issue
* multihead_attention: complete backward implementation
* multihead_attention: add comments for forward prop
* multihead_attention: add tests for backward
* multihead_attention: adjust expected test values for updated scaling
* multihead_attention: calculate scaling factor only once
* multihead_attention: use heap-allocated arrays during back prop
* multihead_attention: use heap-allocated arrays in forward
* multihead_attention: set values from correct shape to tests
* multihead_attention: fix issues with shapes (softmax prime became even more monstrous)
* multihead_attention: minor refactoring and optimization
* multihead_attention: fix comments
* multihead_attention: tests, add checks for attention weights
* multihead_attention: remove some of the copy-paste comments
* multihead_attention: optimize shapes
* multihead_attention: params api
* multihead_attention: fix incorrect dw bug
* multihead_attention: tests for updated parameters
* multihead_attention: remove reshape crutches
* multihead_attention: rename common forward and backward calls
* multihead_attention: tidy mha up
* multihead_attention: self attention
* multihead_attention: add cross attention
* multihead_attention: add more comments
* multihead_attention: arrange attention into submodule
* multihead_attention: update cmakelists
* multihead_attention: update attention in accordance with linear2d
* multihead_attention: remove redundant constructor args for attention layers
* multihead_attention: use pure and elemental where necessary
* multihead_attention: plumbing
* multihead_attention: add reference
* multihead_attention: remove rebase artifact
* multihead_attention: remove redundant args
* multihead_attention: update tests
* multihead_attention: add the most important lines to tests
* multihead_attention: simple MHA example
* multihead_attention: update cmake
* multihead_attention: remove debug line from tests
* multihead_attention: set slightly higher margin for fp imprecision (due to IEEE_DENORMAL)
* Rename mha_simple example
* Update src/nf/nf_multihead_attention.f90 (four review commits, co-authored by Jeremie Vandenplas <[email protected]>)
* Tidy up
* Add self_attention to the layers table

---------

Co-authored-by: milancurcic <[email protected]>
Co-authored-by: Jeremie Vandenplas <[email protected]>
1 parent 039638d commit ed8b340

15 files changed: +1188 −5 lines
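The commit titles above mention scaled dot-product attention. For orientation only (this note is not part of the diff), the standard multi-head formulation of Vaswani et al. (2017), which the new layers appear to follow with a per-head scaling of $1/\sqrt{d_k}$, is

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^\top}{\sqrt{d_k}}\right) V$$

$$\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head}_1, \ldots, \mathrm{head}_h)\, W^O, \qquad \mathrm{head}_i = \mathrm{Attention}(Q W_i^Q,\, K W_i^K,\, V W_i^V)$$

The `self_attention` layer takes Q, K, and V from the same input sequence; the cross-attention layer takes Q from one sequence and K and V from another (see src/nf/nf_cross_attention_layer.f90 below).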

CMakeLists.txt (+4)

@@ -20,6 +20,7 @@ add_library(neural-fortran
   src/nf/nf_base_layer.f90
   src/nf/nf_conv2d_layer.f90
   src/nf/nf_conv2d_layer_submodule.f90
+  src/nf/nf_cross_attention_layer.f90
   src/nf/nf_datasets.f90
   src/nf/nf_datasets_submodule.f90
   src/nf/nf_datasets_mnist.f90
@@ -45,6 +46,8 @@ add_library(neural-fortran
   src/nf/nf_maxpool2d_layer.f90
   src/nf/nf_maxpool2d_layer_submodule.f90
   src/nf/nf_metrics.f90
+  src/nf/nf_multihead_attention.f90
+  src/nf/nf_multihead_attention_submodule.f90
   src/nf/nf_network.f90
   src/nf/nf_network_submodule.f90
   src/nf/nf_optimizers.f90
@@ -53,6 +56,7 @@ add_library(neural-fortran
   src/nf/nf_random.f90
   src/nf/nf_reshape_layer.f90
   src/nf/nf_reshape_layer_submodule.f90
+  src/nf/nf_self_attention_layer.f90
   src/nf/io/nf_io_binary.f90
   src/nf/io/nf_io_binary_submodule.f90
   src/nf/nf_dropout_layer.f90

README.md (+2 −1)

@@ -34,8 +34,9 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
 | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
 | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
+| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
 | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
-| Linear (2-d) | `linear2d` | `input2d`, `linear2d` | 2 | ✅ | ✅ |
 | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |

 (*) See Issue [#145](https://github.com/modern-fortran/neural-fortran/issues/145) regarding non-converging CNN training on the MNIST dataset.

example/CMakeLists.txt (+1)

@@ -6,6 +6,7 @@ foreach(execid
   simple
   sine
   quadratic
+  mha_simple
 )
 add_executable(${execid} ${execid}.f90)
 target_link_libraries(${execid} PRIVATE

example/mha_simple.f90 (+37, new file)

@@ -0,0 +1,37 @@
+program mha_simple
+  use nf, only: dense, input, network, sgd, self_attention, flatten
+  implicit none
+  type(network) :: net
+  real, allocatable :: x(:, :), y(:)
+  integer, parameter :: num_iterations = 500
+  integer :: n
+
+  print '("Simple")'
+  print '(60("="))'
+
+  net = network([ &
+    input(3, 8), &
+    self_attention(4), &
+    flatten(), &
+    dense(2) &
+  ])
+
+  call net % print_info()
+
+  allocate(x(3, 8))
+  call random_number(x)
+
+  y = [0.123456, 0.246802]
+
+  do n = 0, num_iterations
+
+    call net % forward(x)
+    call net % backward(y)
+    call net % update(optimizer=sgd(learning_rate=1.))
+
+    if (mod(n, 50) == 0) &
+      print '(i4,2(3x,f8.6))', n, net % predict(x)
+
+  end do
+
+end program mha_simple

src/nf.f90 (+11 −1)

@@ -3,7 +3,15 @@ module nf
   use nf_datasets_mnist, only: label_digits, load_mnist
   use nf_layer, only: layer
   use nf_layer_constructors, only: &
-    conv2d, dense, dropout, flatten, input, linear2d, maxpool2d, reshape
+    conv2d, &
+    dense, &
+    dropout, &
+    flatten, &
+    input, &
+    linear2d, &
+    maxpool2d, &
+    reshape, &
+    self_attention
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network
@@ -12,4 +20,6 @@ module nf
     gaussian, linear, relu, leaky_relu, &
     sigmoid, softmax, softplus, step, tanhf, &
     celu
+  use nf_linear2d_layer, only: linear2d_layer
+  use nf_multihead_attention_layer, only: multihead_attention_layer
 end module nf

src/nf/nf_cross_attention_layer.f90 (+66, new file)

@@ -0,0 +1,66 @@
+module nf_cross_attention_layer
+  use iso_fortran_env, only: stderr => error_unit
+  use nf_activation, only: softmax
+  use nf_linear2d_layer, only: linear2d_layer
+  use nf_multihead_attention_layer, only: multihead_attention_layer
+
+  implicit none
+
+  type, extends(multihead_attention_layer) :: cross_attention_layer
+    !! Cross Attention Layer
+    !! Source:
+    !! Bahdanau, D. (2014)
+    !! Neural machine translation by jointly learning to align and translate.
+    !! https://arxiv.org/pdf/1409.0473
+    real, allocatable :: gradient(:, :, :)
+  contains
+    procedure :: forward
+    procedure :: backward
+    procedure :: init
+  end type cross_attention_layer
+
+  interface cross_attention_layer
+    module function cross_attention_layer_cons(n_heads) result(res)
+      !! This function returns the `cross_attention_layer` instance.
+      integer, intent(in) :: n_heads
+      type(cross_attention_layer) :: res
+    end function cross_attention_layer_cons
+  end interface cross_attention_layer
+
+contains
+  module function cross_attention_layer_cons(n_heads) result(res)
+    !! This function returns the `cross_attention_layer` instance.
+    integer, intent(in) :: n_heads
+    type(cross_attention_layer) :: res
+    res % n_heads = n_heads
+  end function cross_attention_layer_cons
+
+  pure module subroutine backward(self, input, gradient)
+    !! Cross attention backward propagation
+    class(cross_attention_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :, :)
+    real, intent(in) :: gradient(:, :)
+
+    call self % common_backward(input(1, :, :), gradient)
+    self % gradient(1, :, :) = self % query_layer % gradient
+    self % gradient(2, :, :) = self % key_layer % gradient + self % value_layer % gradient
+  end subroutine backward
+
+  pure module subroutine forward(self, input)
+    !! Cross attention forward propagation.
+    !! Input shape: (kind, sequence_length, model_dimension),
+    !! where kind is 1 for query and 2 for key-value.
+    class(cross_attention_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :, :)
+
+    call self % common_forward(input(1, :, :), input(2, :, :), input(2, :, :))
+  end subroutine forward
+
+  module subroutine init(self, input_shape)
+    class(cross_attention_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    call self % init_base(input_shape)
+    allocate(self % gradient(2, self % sequence_length, self % model_dimension))
+  end subroutine init
+end module nf_cross_attention_layer
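Below is a minimal, hypothetical usage sketch for the cross-attention layer above (not part of this commit). It assumes that init accepts [sequence_length, model_dimension] and that the attended result is stored in the output component inherited from multihead_attention_layer, as the self_attention plumbing in src/nf/nf_layer_submodule.f90 suggests.

program cross_attention_sketch
  ! Hypothetical sketch; the assumed model_dimension of 8 must be divisible by n_heads.
  use nf_cross_attention_layer, only: cross_attention_layer
  implicit none
  type(cross_attention_layer) :: attn
  ! Shape (kind, sequence_length, model_dimension); kind 1 holds the query
  ! sequence, kind 2 the key-value sequence.
  real :: x(2, 3, 8)

  attn = cross_attention_layer(n_heads=4)
  call attn % init([3, 8])        ! assumption: init takes [sequence_length, model_dimension]

  call random_number(x)
  call attn % forward(x)          ! attends x(1,:,:) over x(2,:,:)
  print *, shape(attn % output)   ! assumption: inherited `output` holds the result
end program cross_attention_sketch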

src/nf/nf_layer_constructors.f90 (+20 −1)

@@ -8,7 +8,16 @@ module nf_layer_constructors
   implicit none

   private
-  public :: conv2d, dense, dropout, flatten, input, linear2d, maxpool2d, reshape
+  public :: &
+    conv2d, &
+    dense, &
+    dropout, &
+    flatten, &
+    input, &
+    linear2d, &
+    maxpool2d, &
+    reshape, &
+    self_attention

   interface input

@@ -213,6 +222,16 @@ module function linear2d(out_features) result(res)
         !! Resulting layer instance
     end function linear2d

+    module function self_attention(num_heads) result(res)
+      !! Rank-2 (sequence_length, out_features) self attention constructor.
+      !! sequence_length and model_dimension are determined at layer initialization, based on the
+      !! output shape of the previous layer.
+      integer, intent(in) :: num_heads
+        !! Number of attention heads
+      type(layer) :: res
+        !! Resulting layer instance
+    end function self_attention
+
   end interface

 end module nf_layer_constructors

src/nf/nf_layer_constructors_submodule.f90 (+9)

@@ -11,6 +11,7 @@
   use nf_maxpool2d_layer, only: maxpool2d_layer
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
+  use nf_self_attention_layer, only: self_attention_layer
   use nf_activation, only: activation_function, relu, sigmoid

   implicit none
@@ -170,4 +171,12 @@ module function linear2d(out_features) result(res)

   end function linear2d

+  module function self_attention(num_heads) result(res)
+    integer, intent(in) :: num_heads
+    type(layer) :: res
+
+    res % name = 'self_attention'
+    allocate(res % p, source=self_attention_layer(num_heads))
+  end function self_attention
+
 end submodule nf_layer_constructors_submodule

src/nf/nf_layer_submodule.f90 (+45 −2)

@@ -11,6 +11,7 @@
   use nf_maxpool2d_layer, only: maxpool2d_layer
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
+  use nf_self_attention_layer, only: self_attention_layer
   use nf_optimizers, only: optimizer_base_type

 contains
@@ -57,6 +58,8 @@ pure module subroutine backward_1d(self, previous, gradient)
           call this_layer % backward(prev_layer % output, gradient)
         type is(linear2d_layer)
           call this_layer % backward(prev_layer % output, gradient)
+        type is(self_attention_layer)
+          call this_layer % backward(prev_layer % output, gradient)
       end select

   end select
@@ -79,6 +82,19 @@ pure module subroutine backward_2d(self, previous, gradient)
           call this_layer % backward(prev_layer % output, gradient)
         type is(linear2d_layer)
           call this_layer % backward(prev_layer % output, gradient)
+        type is(self_attention_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+      end select
+
+    type is(self_attention_layer)
+
+      select type(prev_layer => previous % p)
+        type is(input2d_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+        type is(linear2d_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+        type is(self_attention_layer)
+          call this_layer % backward(prev_layer % output, gradient)
       end select

   end select
@@ -240,6 +256,20 @@ module subroutine forward(self, input)
           call this_layer % forward(prev_layer % output)
         type is(linear2d_layer)
           call this_layer % forward(prev_layer % output)
+        type is(self_attention_layer)
+          call this_layer % forward(prev_layer % output)
+      end select
+
+    type is(self_attention_layer)
+
+      ! Upstream layers permitted: input2d, linear2d, self_attention
+      select type(prev_layer => input % p)
+        type is(input2d_layer)
+          call this_layer % forward(prev_layer % output)
+        type is(linear2d_layer)
+          call this_layer % forward(prev_layer % output)
+        type is(self_attention_layer)
+          call this_layer % forward(prev_layer % output)
       end select

   end select
@@ -279,6 +309,8 @@ pure module subroutine get_output_2d(self, output)
       allocate(output, source=this_layer % output)
     type is(linear2d_layer)
       allocate(output, source=this_layer % output)
+    type is(self_attention_layer)
+      allocate(output, source=this_layer % output)
     class default
       error stop '2-d output can only be read from an input2d or linear2d layer.'

@@ -322,8 +354,8 @@ impure elemental module subroutine init(self, input)
       call this_layer % init(input % layer_shape)
   end select

-  ! The shape of conv2d, dropout, flatten, linear2d, or maxpool2d layers
-  ! is not known until we receive an input layer.
+  ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, or
+  ! self_attention layers is not known until we receive an input layer.
   select type(this_layer => self % p)
     type is(conv2d_layer)
       self % layer_shape = shape(this_layer % output)
@@ -333,6 +365,8 @@ impure elemental module subroutine init(self, input)
       self % layer_shape = shape(this_layer % output)
     type is(linear2d_layer)
       self % layer_shape = shape(this_layer % output)
+    type is(self_attention_layer)
+      self % layer_shape = shape(this_layer % output)
     type is(maxpool2d_layer)
       self % layer_shape = shape(this_layer % output)
   end select
@@ -389,6 +423,8 @@ elemental module function get_num_params(self) result(num_params)
       num_params = 0
     type is (linear2d_layer)
       num_params = this_layer % get_num_params()
+    type is (self_attention_layer)
+      num_params = this_layer % get_num_params()
     class default
       error stop 'Unknown layer type.'
   end select
@@ -420,6 +456,8 @@ module function get_params(self) result(params)
       ! No parameters to get.
     type is (linear2d_layer)
       params = this_layer % get_params()
+    type is (self_attention_layer)
+      params = this_layer % get_params()
     class default
       error stop 'Unknown layer type.'
   end select
@@ -451,6 +489,8 @@ module function get_gradients(self) result(gradients)
       ! No gradients to get.
     type is (linear2d_layer)
       gradients = this_layer % get_gradients()
+    type is (self_attention_layer)
+      gradients = this_layer % get_gradients()
     class default
       error stop 'Unknown layer type.'
   end select
@@ -506,6 +546,9 @@ module subroutine set_params(self, params)
     type is (linear2d_layer)
       call this_layer % set_params(params)

+    type is (self_attention_layer)
+      call this_layer % set_params(params)
+
     type is (maxpool2d_layer)
       ! No parameters to set.
       write(stderr, '(a)') 'Warning: calling set_params() ' &
