-
Notifications
You must be signed in to change notification settings - Fork 0
/
cudamatmul.cuf
232 lines (160 loc) · 5.62 KB
/
cudamatmul.cuf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
! start the module containing the matrix multiply kernel
module mmul_mod
use cudafor
contains
! mmul_kernel computes A*B into C where A is NxM, B is MxL, C is then NxL
attributes(global) subroutine mmul_kernel( A, B, C, N, M, L )
real,device :: A(N,M), B(M,L), C(N,L)
integer, value :: N, M, L
integer :: i, j, kb, k, tx, ty
! submatrices are declared to be in CUDA shared memory
real, shared :: Asub(16,16), Bsub(16,16)
! the value of C(i,j) being computed, a temporary scalar
real :: Cij
! Start execution, first get my thread indices
tx = threadidx%x
ty = threadidx%y
! This thread computes C(i,j) = sum(A(i,:) * B(:,j))
i = (blockidx%x-1) * 16 + tx
j = (blockidx%y-1) * 16 + ty
Cij = 0.0
! Do the k loop in chunks of 16, the block size
do kb = 1, M, 16
! Fill the submatrices; each of 16x16 threads in the thread block
! loads one element of Asub and Bsub
Asub(tx,ty) = A(i,kb+ty-1)
Bsub(tx,ty) = B(kb+tx-1,j)
! Wait until all elements are filled
call syncthreads()
! Multiply the two submatrices; ! Each of the 16x16 threads accumulates the
! dot product for its element of C(i,j)
do k = 1,16
Cij = Cij + Asub(tx,k) * Bsub(k,ty)
enddo
! Synchronize to make sure all threads are done reading the submatrices before
! overwriting them in the next iteration of the kb loop
call syncthreads()
enddo
! Each of the 16x16 threads stores its element to the global C array
C(i,j) = Cij
end subroutine mmul_kernel
! The host routine to drive the matrix multiplication
subroutine mmul( A, B, C )
! assumed shape input arrays
real, dimension(:,:) :: A, B, C
! Array dimensions
integer :: N, M, L
! allocatable device arrays
real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev
! dim3 variables to define the grid and block shapes
type(dim3) :: dimGrid, dimBlock
integer :: r
! Get the array sizes
real ctimeall, ctimekernel, flops, mflopskernel, mflopsall
integer c1, c2, c3, c4
! Begin execution, first determine the sizes of the input arrays
N = size( A, 1 )
M = size( A, 2 )
L = size( B, 2 )
! Start data xfer-inclusive timer and allocate the device arrays using
! F90 ALLOCATE
call system_clock( count=c1 )
allocate( Adev(N,M), Bdev(M,L), Cdev(N,L) )
! Copy A and B to the device using F90 array assignments
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
! Create the grid and block dimensions
dimGrid = dim3( N/16, L/16, 1 )
dimBlock = dim3( 16, 16, 1 )
! Start data xfer-exclusive timer, launch the GPU kernel, wait for completion
call system_clock( count=c2 )
call mmul_kernel<<<dimGrid,dimBlock>>>( Adev, Bdev, Cdev, N, M, L )
r = cudathreadsynchronize()
! Stop data xfer-exlusive timer, copy the results back, stop data xfer-
! inclusive timer
call system_clock( count=c3 )
C(1:N,1:L) = Cdev
call system_clock( count=c4 )
! Calculate inclusive/exclusive execution times, and report MFLOPS
flops = float(N) * float(M) * float(L)
ctimekernel = c3 - c2
mflopskernel = flops / ctimekernel
ctimeall = c4 - c1
mflopsall = flops / ctimeall
! Print out results
print *, 'Kernel time excluding data xfer:', ctimekernel, ' microseconds'
print *, 'Megaflops excluding data xfer: ', mflopskernel
print *, 'Total time including data xfer: ', ctimeall, ' microseconds'
print *, 'Megaflops including data xfer: ', mflopsall
! Deallocate device arrays and exit
deallocate( Adev, Bdev, Cdev )
end subroutine mmul
end module mmul_mod
! Main program to initialize arrays, invoke mmul, check results
program matmul
use mmul_mod
real,dimension(:,:),allocatable :: A,B,C,CC
integer N, M, L
integer idevice, istat
! Begin execution
N = 512
M = 1024
L = 512
idevice = 0
print *,' arrays sized ', N, ' by ', M, ' by ', L
allocate(A(N,M),B(M,L),C(N,L),CC(N,L))
! Initialize the A and B arrays; zero out the C array to be computed
! on the GPU, and the CC array to be computed on the host
do j = 1,M
do i = 1,N
A(i,j) = i*10 + j*1000
enddo
enddo
do j = 1,L
do i = 1,M
B(i,j) = i-j
enddo
enddo
do j = 1,L
do i = 1,N
CC(i,j) = 0.0
C(i,j) = 0.0
enddo
enddo
! Initialize CPU device
istat = cudaSetDevice(idevice)
! Call matrix multiply subroutine to execute on the GPU to compute C
print *,'calling mmul'
call mmul( A, B, C )
print *,' C(1,1) = ', C(1,1)
print *,' C(2,2) = ', C(2,2)
! Perform matrix multiply on host to compute CC
do i = 1,N
do j = 1,L
do k = 1,M
CC(i,j) = CC(i,j) + A(i,k)*B(k,j)
enddo
enddo
enddo
! Check for errors
ierr = 0
do j = 1,L
do i = 1,N
diff = abs(C(i,j) - CC(i,j))
denom = CC(i,j)
if ( denom == 0.0 ) denom = 1.0
error = diff / denom
if ( error > 2.0e-5 ) then
ierr = ierr + 1
if ( ierr <= 10 ) then
print *, 'C(',i,',',j,') = ',C(i,j), ' should be ', CC(i,j), ' error=', error
endif
endif
enddo
enddo
if( ierr == 0 )then
print *, ' No errors found'
else
print *, ierr, ' ERRORS FOUND!!!'
endif
end program