Skip to content

Commit

Permalink
Merge pull request #2561 from JuliaGPU/tb/rand
Browse files: browse the repository at this point in the history
Native RNG fixes for very large arrays
  • Loading branch information
maleadt authored Dec 11, 2024
2 parents 478a952 + 150af15 commit 860eb88
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 11 deletions.
22 changes: 11 additions & 11 deletions src/random.jl
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ function Random.rand!(rng::RNG, A::AnyCuArray)

# grid-stride loop
threadId = threadIdx().x
window = blockDim().x * gridDim().x
offset = (blockIdx().x - 1) * blockDim().x
window = widemul(blockDim().x, gridDim().x)
offset = widemul(blockIdx().x - 1i32, blockDim().x)
while offset < length(A)
i = threadId + offset
if i <= length(A)
Expand Down Expand Up @@ -96,8 +96,8 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A

# grid-stride loop
threadId = threadIdx().x
window = (blockDim().x - 1) * gridDim().x
offset = (blockIdx().x - 1) * blockDim().x
window = widemul(blockDim().x, gridDim().x)
offset = widemul(blockIdx().x - 1i32, blockDim().x)
while offset < length(A)
i = threadId + offset
j = threadId + offset + window
Expand Down Expand Up @@ -129,8 +129,8 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A

# grid-stride loop
threadId = threadIdx().x
window = (blockDim().x - 1) * gridDim().x
offset = (blockIdx().x - 1) * blockDim().x
window = widemul(blockDim().x, gridDim().x)
offset = widemul(blockIdx().x - 1i32, blockDim().x)
while offset < length(A)
i = threadId + offset
if i <= length(A)
Expand All @@ -150,11 +150,11 @@ function Random.randn!(rng::RNG, A::AnyCuArray{<:Union{AbstractFloat,Complex{<:A
return
end

kernel = @cuda launch=false name="rand!" kernel(A, rng.seed, rng.counter)
config = launch_configuration(kernel.fun; max_threads=64)
threads = max(32, min(config.threads, length(A)÷2))
blocks = min(config.blocks, cld(cld(length(A), 2), threads))
kernel(A, rng.seed, rng.counter; threads, blocks)
# see note in `rand!` about the launch configuration
threads = 32
blocks = cld(cld(length(A), 2), threads)

@cuda threads=threads blocks=blocks name="randn!" kernel(A, rng.seed, rng.counter)

new_counter = Int64(rng.counter) + length(A)
overflow, remainder = fldmod(new_counter, typemax(UInt32))
Expand Down
7 changes: 7 additions & 0 deletions test/base/random.jl
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,10 @@ end
end
end

@testset "counter overflow" begin
    generator = CUDA.RNG()

    # 64 * 32 * 512 * 32 * 64 = 2^31 Float16 elements, i.e. 4 GiB of data. A plain
    # device allocation of that size may not succeed, so back the array with
    # unified memory instead.
    dims = (64, 32, 512, 32, 64)
    buffer = CuArray{Float16, 5, CUDA.UnifiedMemory}(undef, dims)

    # Smoke test: both in-place samplers only need to run to completion on the
    # very large array; no values are inspected.
    for sampler! in (rand!, randn!)
        sampler!(generator, buffer)
    end
end

0 comments on commit 860eb88

Please sign in to comment.