Commit f08f804

layernorm: rearrange into submodule
1 parent 70d0f27 commit f08f804

File tree

2 files changed: +127 -107 lines

src/nf/nf_layernorm.f90 (+17 -107)
@@ -41,111 +41,21 @@ module function layernorm_layer_cons() &
     end function layernorm_layer_cons
   end interface layernorm_layer
 
-contains
-  module function layernorm_layer_cons() &
-      result(res)
-    type(layernorm_layer) :: res
-
-    res % eps = 1e-5
-  end function layernorm_layer_cons
-
-  pure module subroutine forward(self, input)
-    class(layernorm_layer), intent(in out) :: self
-    real, intent(in) :: input(:, :)
-    real, allocatable :: normalized(:, :)
-    integer :: i
-
-    allocate(normalized(self % sequence_length, self % model_dimension))
-
-    ! mu = x - MEAN_last_dim(x)
-    do concurrent(i = 1: self % model_dimension)
-      self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension)
-    end do
-
-    ! square root of variance shifted by eps
-    self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps)
-
-    ! normalize mu by sigma along the first axis
-    do concurrent(i = 1: self % model_dimension)
-      normalized(:, i) = self % mu(:, i) / self % sigma
-    end do
-
-    ! forward through trainable params gamma and beta
-    do concurrent(i = 1: self % sequence_length)
-      self % output(i, :) = normalized(i, :) * self % gamma + self % beta
-    end do
-
-    deallocate(normalized)
-  end subroutine forward
-
-  pure module subroutine backward(self, input, gradient)
-    class(layernorm_layer), intent(in out) :: self
-    real, intent(in) :: input(:, :)
-    real, intent(in) :: gradient(:, :)
-    real, allocatable :: one_over_sigma(:, :)
-    real, allocatable :: gradient_by_gamma_over_sigma(:, :)
-
-    allocate(one_over_sigma(self % sequence_length, self % model_dimension))
-    allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension))
-
-    one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension))
-    gradient_by_gamma_over_sigma = &
-        gradient &
-        * spread(self % gamma, dim=1, ncopies=self % sequence_length) &
-        * one_over_sigma
-
-    ! d_output/d_gamma = sum(d_output/d_y * mu/sigma)
-    self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1)
-
-    ! d_output/d_beta = sum(d_output/d_y) * 1
-    self % d_beta = sum(gradient, dim=1)
-
-    ! From this article:
-    ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/
-    ! d_output/d_x = d_output/d_y * gamma/sigma
-    !     - d_output/d_y
-    !     - sum(d_output/d_y * gamma/sigma) / len
-    !     - mu * sum(d_output/d_y * gamma * mu * sigma^(-3)) / len
-    self % gradient = &
-        gradient_by_gamma_over_sigma &
-        - spread(&
-            sum(gradient_by_gamma_over_sigma, dim=2),&
-            dim=2,&
-            ncopies=self % model_dimension&
-        ) / self % model_dimension &
-        - self % mu * spread(&
-            sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),&
-            dim=2,&
-            ncopies=self % model_dimension&
-        ) / self % model_dimension
-
-    deallocate(one_over_sigma)
-    deallocate(gradient_by_gamma_over_sigma)
-  end subroutine backward
-
-  module subroutine init(self, input_shape)
-    class(layernorm_layer), intent(in out) :: self
-    integer, intent(in) :: input_shape(:)
-
-    if (size(input_shape) /= 2) then
-      error stop "LayerNorm Layer accepts 2D input"
-    end if
-    self % sequence_length = input_shape(1)
-    self % model_dimension = input_shape(2)
-
-    ! default initialization from PyTorch
-    allocate(self % gamma(self % model_dimension))
-    self % gamma = 1.
-    allocate(self % beta(self % model_dimension))
-    self % beta = 0.
-
-    allocate(self % d_gamma(self % model_dimension))
-    allocate(self % d_beta(self % model_dimension))
-    allocate(self % gradient(self % sequence_length, self % model_dimension))
-
-    allocate(self % mu(self % sequence_length, self % model_dimension))
-    allocate(self % sigma(self % sequence_length))
-
-    allocate(self % output(self % sequence_length, self % model_dimension))
-  end subroutine init
+  interface
+    pure module subroutine forward(self, input)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in) :: input(:, :)
+    end subroutine forward
+
+    pure module subroutine backward(self, input, gradient)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in) :: input(:, :)
+      real, intent(in) :: gradient(:, :)
+    end subroutine backward
+
+    module subroutine init(self, input_shape)
+      class(layernorm_layer), intent(in out) :: self
+      integer, intent(in) :: input_shape(:)
+    end subroutine init
+  end interface
 end module nf_layernorm_layer
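
For readers who don't use Fortran submodules: the hunk above keeps only the procedure interfaces in nf_layernorm_layer and moves the bodies into a separate submodule, so an edit to an implementation no longer changes the module's interface and does not force recompilation of code that uses the module. A minimal, self-contained sketch of the same pattern, with hypothetical names (demo_mod, demo_square) that are not part of neural-fortran:

module demo_mod
  implicit none
  interface
    ! Only the signature is declared here; the body lives in a submodule.
    pure module function demo_square(x) result(y)
      real, intent(in) :: x
      real :: y
    end function demo_square
  end interface
end module demo_mod

submodule(demo_mod) demo_mod_submodule
  implicit none
contains
  ! Implementation of the procedure declared in demo_mod's interface block.
  pure module function demo_square(x) result(y)
    real, intent(in) :: x
    real :: y
    y = x * x
  end function demo_square
end submodule demo_mod_submodule

The two program units can live in separate source files, which is exactly what this commit does for the layernorm procedures below.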

src/nf/nf_layernorm_submodule.f90 (new file, +110)
@@ -0,0 +1,110 @@
+submodule(nf_layernorm_layer) nf_layernorm_layer_submodule
+  implicit none
+contains
+  module function layernorm_layer_cons() &
+      result(res)
+    type(layernorm_layer) :: res
+
+    res % eps = 1e-5
+  end function layernorm_layer_cons
+
+  pure module subroutine forward(self, input)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :)
+    real, allocatable :: normalized(:, :)
+    integer :: i
+
+    allocate(normalized(self % sequence_length, self % model_dimension))
+
+    ! mu = x - MEAN_last_dim(x)
+    do concurrent(i = 1: self % model_dimension)
+      self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension)
+    end do
+
+    ! square root of variance shifted by eps
+    self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps)
+
+    ! normalize mu by sigma along the first axis
+    do concurrent(i = 1: self % model_dimension)
+      normalized(:, i) = self % mu(:, i) / self % sigma
+    end do
+
+    ! forward through trainable params gamma and beta
+    do concurrent(i = 1: self % sequence_length)
+      self % output(i, :) = normalized(i, :) * self % gamma + self % beta
+    end do
+
+    deallocate(normalized)
+  end subroutine forward
+
+  pure module subroutine backward(self, input, gradient)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :)
+    real, intent(in) :: gradient(:, :)
+    real, allocatable :: one_over_sigma(:, :)
+    real, allocatable :: gradient_by_gamma_over_sigma(:, :)
+
+    allocate(one_over_sigma(self % sequence_length, self % model_dimension))
+    allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension))
+
+    one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension))
+    gradient_by_gamma_over_sigma = &
+        gradient &
+        * spread(self % gamma, dim=1, ncopies=self % sequence_length) &
+        * one_over_sigma
+
+    ! d_output/d_gamma = sum(d_output/d_y * mu/sigma)
+    self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1)
+
+    ! d_output/d_beta = sum(d_output/d_y) * 1
+    self % d_beta = sum(gradient, dim=1)
+
+    ! From this article:
+    ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/
+    ! d_output/d_x = d_output/d_y * gamma/sigma
+    !     - d_output/d_y
+    !     - sum(d_output/d_y * gamma/sigma) / len
+    !     - mu * sum(d_output/d_y * gamma * mu * sigma^(-3)) / len
+    self % gradient = &
+        gradient_by_gamma_over_sigma &
+        - spread(&
+            sum(gradient_by_gamma_over_sigma, dim=2),&
+            dim=2,&
+            ncopies=self % model_dimension&
+        ) / self % model_dimension &
+        - self % mu * spread(&
+            sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),&
+            dim=2,&
+            ncopies=self % model_dimension&
+        ) / self % model_dimension
+
+    deallocate(one_over_sigma)
+    deallocate(gradient_by_gamma_over_sigma)
+  end subroutine backward
+
+  module subroutine init(self, input_shape)
+    class(layernorm_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    if (size(input_shape) /= 2) then
+      error stop "LayerNorm Layer accepts 2D input"
+    end if
+    self % sequence_length = input_shape(1)
+    self % model_dimension = input_shape(2)
+
+    ! default initialization from PyTorch
+    allocate(self % gamma(self % model_dimension))
+    self % gamma = 1.
+    allocate(self % beta(self % model_dimension))
+    self % beta = 0.
+
+    allocate(self % d_gamma(self % model_dimension))
+    allocate(self % d_beta(self % model_dimension))
+    allocate(self % gradient(self % sequence_length, self % model_dimension))
+
+    allocate(self % mu(self % sequence_length, self % model_dimension))
+    allocate(self % sigma(self % sequence_length))
+
+    allocate(self % output(self % sequence_length, self % model_dimension))
+  end subroutine init
+end submodule nf_layernorm_layer_submodule
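
As a reading aid, the comments in backward point to the robotchinwag.com derivation linked above; assuming the code implements that formula directly, the forward pass and the gradients it computes can be restated for sequence position i, feature j, and model dimension d as:

\[
\mu_{ij} = x_{ij} - \frac{1}{d} \sum_{k} x_{ik}, \qquad
\sigma_i = \sqrt{\frac{1}{d} \sum_{k} \mu_{ik}^2 + \varepsilon}, \qquad
y_{ij} = \gamma_j \frac{\mu_{ij}}{\sigma_i} + \beta_j
\]

\[
\frac{\partial L}{\partial \gamma_j} = \sum_i \frac{\partial L}{\partial y_{ij}} \frac{\mu_{ij}}{\sigma_i}, \qquad
\frac{\partial L}{\partial \beta_j} = \sum_i \frac{\partial L}{\partial y_{ij}}
\]

\[
\frac{\partial L}{\partial x_{ij}} =
\frac{\partial L}{\partial y_{ij}} \frac{\gamma_j}{\sigma_i}
- \frac{1}{d} \sum_k \frac{\partial L}{\partial y_{ik}} \frac{\gamma_k}{\sigma_i}
- \frac{\mu_{ij}}{d} \sum_k \frac{\partial L}{\partial y_{ik}} \frac{\gamma_k \mu_{ik}}{\sigma_i^{3}}
\]

The three terms of the last equation correspond, in order, to gradient_by_gamma_over_sigma, the spread(sum(..., dim=2)) / model_dimension term, and the mu-weighted spread(sum(... * one_over_sigma ** 2, dim=2)) / model_dimension term in the code: the 1/sigma factor already inside gradient_by_gamma_over_sigma times the extra 1/sigma^2 gives the 1/sigma^3 in the final sum.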
