# multigpu.jl: vector addition across all available GPUs (from the CUDAnative.jl examples)
using CUDAdrv, CUDAnative
using Test
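# kernel: each launch covers one slice of the arrays; the `gpu` argument folds
# the device number into the global index, so launches on different GPUs never
# touch the same elements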
function vadd(gpu, a, b, c)
    i = threadIdx().x + blockDim().x * ((blockIdx().x-1) + (gpu-1) * gridDim().x)
    c[i] = a[i] + b[i]
    return
end
gpus = length(devices())
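# inputs: one 3×4 slice of the problem per device; values are rounded so
# they're small whole numbers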
dims = (gpus, 3, 4)
a = round.(rand(Float32, dims) * 100)
b = round.(rand(Float32, dims) * 100)
# FIXME: CuArray doesn't tie in with unified memory yet
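# the second argument to Mem.alloc requests managed (unified) memory, so the
# same buffer is accessible from every device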
buf_a = Mem.alloc(sizeof(a), true)
Mem.upload!(buf_a, a)
d_a = CuArray{Float32,3}(dims, buf_a)

buf_b = Mem.alloc(sizeof(b), true)
Mem.upload!(buf_b, b)
d_b = CuArray{Float32,3}(dims, buf_b)

buf_c = Mem.alloc(sizeof(a), true)   # output; only needs the allocation, no upload
d_c = CuArray{Float32,3}(dims, buf_c)
len = prod(dims)
blocks = gpus
threads = len ÷ blocks
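# every device runs blocks÷gpus (here: a single) block of `threads` threads;
# together the launches cover all `len` elements exactly once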
for (gpu,dev) in enumerate(devices())
    @debug "Processing slice $gpu on device $(name(dev))"
    device!(dev)
    @cuda blocks=blocks÷gpus threads=threads vadd(gpu, d_a, d_b, d_c)
end
@debug "Synchronizing devices"
for dev in devices()
    # NOTE: normally you'd use events and wait for them
    device!(dev)
    synchronize()
end
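# a sketch of that event-based alternative (assuming CUDAdrv's CuEvent API):
#   events = map(collect(devices())) do dev
#       device!(dev)
#       ev = CuEvent()
#       record(ev)
#       ev
#   end
#   foreach(synchronize, events)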
c = Array(d_c)
@test a+b ≈ c