@@ -41,111 +41,21 @@ module function layernorm_layer_cons() &
    end function layernorm_layer_cons
  end interface layernorm_layer

- contains
-   module function layernorm_layer_cons() &
-       result(res)
-     type(layernorm_layer) :: res
-
-     res % eps = 1e-5
-   end function layernorm_layer_cons
-
-   pure module subroutine forward(self, input)
-     class(layernorm_layer), intent(inout) :: self
-     real, intent(in) :: input(:, :)
-     real, allocatable :: normalized(:, :)
-     integer :: i
-
-     allocate(normalized(self % sequence_length, self % model_dimension))
-
-     ! mu = x - MEAN_last_dim(x)
-     do concurrent(i = 1: self % model_dimension)
-       self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension)
-     end do
-
-     ! square root of variance shifted by eps
-     self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps)
-
-     ! normalize mu by sigma, broadcast over the last dimension
-     do concurrent(i = 1: self % model_dimension)
-       normalized(:, i) = self % mu(:, i) / self % sigma
-     end do
-
-     ! forward through trainable params gamma and beta
-     do concurrent(i = 1: self % sequence_length)
-       self % output(i, :) = normalized(i, :) * self % gamma + self % beta
-     end do
-
-     deallocate(normalized)
-   end subroutine forward
-
-   pure module subroutine backward(self, input, gradient)
-     class(layernorm_layer), intent(inout) :: self
-     real, intent(in) :: input(:, :)
-     real, intent(in) :: gradient(:, :)
-     real, allocatable :: one_over_sigma(:, :)
-     real, allocatable :: gradient_by_gamma_over_sigma(:, :)
-
-     allocate(one_over_sigma(self % sequence_length, self % model_dimension))
-     allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension))
-
-     one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension))
-     gradient_by_gamma_over_sigma = &
-         gradient &
-         * spread(self % gamma, dim=1, ncopies=self % sequence_length) &
-         * one_over_sigma
-
-     ! d_output/d_gamma = sum(d_output/d_y * mu/sigma)
-     self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1)
-
-     ! d_output/d_beta = sum(d_output/d_y) * 1
-     self % d_beta = sum(gradient, dim=1)
-
-     ! From this article:
-     ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/
-     ! d_output/d_x = d_output/d_y * gamma/sigma
-     !     - d_output/d_y
-     !     - sum(d_output/d_y * gamma/sigma) / len
-     !     - mu * sum(d_output/d_y * gamma * mu * sigma^(-3)) / len
-     self % gradient = &
-         gradient_by_gamma_over_sigma &
-         - spread( &
-             sum(gradient_by_gamma_over_sigma, dim=2), &
-             dim=2, &
-             ncopies=self % model_dimension &
-           ) / self % model_dimension &
-         - self % mu * spread( &
-             sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2), &
-             dim=2, &
-             ncopies=self % model_dimension &
-           ) / self % model_dimension
-
-     deallocate(one_over_sigma)
-     deallocate(gradient_by_gamma_over_sigma)
-   end subroutine backward
-
-   module subroutine init(self, input_shape)
-     class(layernorm_layer), intent(inout) :: self
-     integer, intent(in) :: input_shape(:)
-
-     if (size(input_shape) /= 2) then
-       error stop "LayerNorm Layer accepts 2D input"
-     end if
-     self % sequence_length = input_shape(1)
-     self % model_dimension = input_shape(2)
-
-     ! default initialization from PyTorch
-     allocate(self % gamma(self % model_dimension))
-     self % gamma = 1.
-     allocate(self % beta(self % model_dimension))
-     self % beta = 0.
-
-     allocate(self % d_gamma(self % model_dimension))
-     allocate(self % d_beta(self % model_dimension))
-     allocate(self % gradient(self % sequence_length, self % model_dimension))
-
-     allocate(self % mu(self % sequence_length, self % model_dimension))
-     allocate(self % sigma(self % sequence_length))
-
-     allocate(self % output(self % sequence_length, self % model_dimension))
-   end subroutine init
+ interface
+   pure module subroutine forward(self, input)
+     class(layernorm_layer), intent(inout) :: self
+     real, intent(in) :: input(:, :)
+   end subroutine forward
+
+   pure module subroutine backward(self, input, gradient)
+     class(layernorm_layer), intent(inout) :: self
+     real, intent(in) :: input(:, :)
+     real, intent(in) :: gradient(:, :)
+   end subroutine backward
+
+   module subroutine init(self, input_shape)
+     class(layernorm_layer), intent(inout) :: self
+     integer, intent(in) :: input_shape(:)
+   end subroutine init
+ end interface

end module nf_layernorm_layer
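
For reference, the forward pass being moved out of the module body computes standard layer normalization over the last dimension. A restatement of what the removed code does (not part of the commit), with x the input, d = model_dimension, and eps the stabilizer set in the constructor:

  \mu_{ij} = x_{ij} - \frac{1}{d} \sum_{k=1}^{d} x_{ik},
  \qquad
  \sigma_i = \sqrt{\frac{1}{d} \sum_{k=1}^{d} \mu_{ik}^{2} + \epsilon},
  \qquad
  y_{ij} = \gamma_j \, \frac{\mu_{ij}}{\sigma_i} + \beta_j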
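
Likewise, the backward pass implements the gradient from the linked robotchinwag.com derivation. Written out as the removed code computes it (again a restatement, with g_{ij} = d_output/d_y):

  \frac{\partial L}{\partial \gamma_j} = \sum_i g_{ij} \frac{\mu_{ij}}{\sigma_i},
  \qquad
  \frac{\partial L}{\partial \beta_j} = \sum_i g_{ij}

  \frac{\partial L}{\partial x_{ij}} =
    \frac{\gamma_j g_{ij}}{\sigma_i}
    - \frac{1}{d} \sum_{k=1}^{d} \frac{\gamma_k g_{ik}}{\sigma_i}
    - \frac{\mu_{ij}}{d} \sum_{k=1}^{d} \frac{\gamma_k \mu_{ik} g_{ik}}{\sigma_i^{3}}

The sigma^(-3) in the third term is where the one_over_sigma ** 2 factor in the code comes from: gradient_by_gamma_over_sigma already carries one factor of 1/sigma.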
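
The new interface block means the procedure bodies now live in a submodule. A minimal sketch of the corresponding scaffolding, assuming the bodies move over verbatim; the submodule and file names below are guesses, not taken from this commit:

! Sketch only: submodule name is assumed, not part of this diff.
submodule(nf_layernorm_layer) nf_layernorm_layer_submodule
  implicit none
contains

  module function layernorm_layer_cons() result(res)
    type(layernorm_layer) :: res
    res % eps = 1e-5
  end function layernorm_layer_cons

  pure module subroutine forward(self, input)
    class(layernorm_layer), intent(inout) :: self
    real, intent(in) :: input(:, :)
    ! ... body as removed above ...
  end subroutine forward

  ! backward and init carry over the same way.

end submodule nf_layernorm_layer_submodule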
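
And a hypothetical usage sketch, assuming forward and init are exposed as type-bound procedures on layernorm_layer (the program name and shape values are illustrative, not from the source):

program layernorm_usage
  use nf_layernorm_layer, only: layernorm_layer
  implicit none
  type(layernorm_layer) :: ln
  real :: x(3, 4)

  ln = layernorm_layer()    ! constructor defaults eps to 1e-5
  call ln % init([3, 4])    ! sequence_length = 3, model_dimension = 4
  call random_number(x)
  call ln % forward(x)      ! normalized result lands in ln % output
end program layernorm_usage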