# peakflops.jl — example from CUDAnative.jl (fork of JuliaGPU/CUDAnative.jl)
using CUDAdrv, CUDAnative
using Test

"Dummy kernel doing 100 FMAs."
function kernel_100fma(a, b, c, out)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    i > length(out) && return   # guard stragglers when the launch over-covers
    @inbounds a_val = a[i]
    @inbounds b_val = b[i]
    @inbounds c_val = c[i]
    # 33 iterations of 3 dependent FMAs, plus the final one below: 100 FMAs total
    for j in 1:33
        a_val = CUDAnative.fma(a_val, b_val, c_val)
        b_val = CUDAnative.fma(a_val, b_val, c_val)
        c_val = CUDAnative.fma(a_val, b_val, c_val)
    end
    @inbounds out[i] = CUDAnative.fma(a_val, b_val, c_val)
    return
end
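
# To verify that the dependent FMA chain survives compilation (and is not
# folded away), CUDAnative's reflection macros can dump the generated code.
# A minimal sketch, assuming device arrays d_a, d_b, d_c, d_out exist:
#
#     CUDAnative.@device_code_ptx @cuda kernel_100fma(d_a, d_b, d_c, d_out)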
function peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0))
    ctx = CuContext(dev)

    dims = (n, n)
    a = round.(rand(Float32, dims) * 100)
    b = round.(rand(Float32, dims) * 100)
    c = round.(rand(Float32, dims) * 100)

    d_a = CuArray(a)
    d_b = CuArray(b)
    d_c = CuArray(c)
    d_out = similar(d_a)

    len = prod(dims)
    threads = min(len, 1024)
    blocks = cld(len, threads)   # round up so every element is covered

    # warm-up launch to trigger compilation before timing
    @cuda kernel_100fma(d_a, d_b, d_c, d_out)
    synchronize(ctx)

    secs = CUDAdrv.@elapsed begin
        @cuda blocks=blocks threads=threads kernel_100fma(d_a, d_b, d_c, d_out)
    end
    # 100 FMAs per element, each counted as 2 floating-point operations
    flopcount = 200*len
    flops = flopcount / secs

    destroy!(ctx)
    return flops
end
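
# For context, a back-of-the-envelope theoretical peak: each CUDA core retires
# one FMA (two flops) per cycle. The attribute constants below follow CUDAdrv's
# shortened CUdevice_attribute names, and the cores-per-SM figure (here 64) is
# an architecture-dependent assumption; adjust both for your device:
#
#     dev = CuDevice(0)
#     clock_hz = attribute(dev, CUDAdrv.CLOCK_RATE) * 1000   # reported in kHz
#     sms = attribute(dev, CUDAdrv.MULTIPROCESSOR_COUNT)
#     cores_per_sm = 64                                      # assumed, per architecture
#     theoretical_peak = 2 * clock_hz * sms * cores_per_sm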
println(peakflops())
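
# As a rough point of comparison, the CPU's peak can be measured with the
# standard library. Note this benchmarks a Float64 gemm, so it is not directly
# comparable to the Float32 FMA number above; it is shown purely for scale.
using LinearAlgebra
println("CPU peakflops (Float64 gemm): ", LinearAlgebra.peakflops())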