Skip to content

Commit

Permalink
Optimize kernel configurators (#62)
Browse files Browse the repository at this point in the history
* Start

* Complete 1D

* Complete 2D

* Complete 3D

* Minor change
  • Loading branch information
huiyuxie authored Oct 4, 2024
1 parent 8da7938 commit 471cb7d
Show file tree
Hide file tree
Showing 7 changed files with 351 additions and 380 deletions.
84 changes: 69 additions & 15 deletions src/auxiliary/configurators.jl
Original file line number Diff line number Diff line change
@@ -1,32 +1,86 @@
# Kernel configurators are used for determining the number of threads and
# blocks to be used in the kernel, which optimizes the use of GPU resources.

# 1D kernel configurator
#
# Return the launch configuration `(threads = ..., blocks = ...)` for a kernel
# that is launched over `x` work items in one dimension.
#
# The block size is hardcoded to 32 threads: the warp size is 32, so a block
# smaller than 32 threads would be padded to 32 anyway. The number of blocks is
# `x` divided by the block size, rounded up, so the launched grid can contain
# more threads than there are work items.
#
# `kernel` is not used in the 1D case (no call to `launch_configuration` is
# needed because the block size is fixed).
#
# Throws an `ArgumentError` if `x` is not positive, since that would yield a
# grid with no blocks.
function kernel_configurator_1d(kernel::HostKernel, x::Int)
    x > 0 || throw(ArgumentError("x must be positive, got x = $x"))

    threads = 32
    blocks = cld(x, threads)

    return (threads = threads, blocks = blocks)
end

# 2D kernel configurator
#
# Return the launch configuration `(threads = (tx, ty), blocks = (bx, by))` for
# a kernel that is launched over an `x` by `y` index space.
#
# The x dimension of a block is hardcoded to 32 threads (the warp size). The y
# dimension is the smaller of
#   - the number of 32-wide rows needed to cover all `x * y` work items, and
#   - the number of 32-wide rows that fit into the thread count returned by
#     `launch_configuration` for this kernel (at least one row).
# The number of blocks per dimension is the extent divided by the block size in
# that dimension, rounded up.
#
# Throws an `ArgumentError` if `x` or `y` is not positive. A zero extent would
# otherwise produce a zero-sized block dimension and a `DivideError` when the
# number of blocks is computed.
function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
    (x > 0 && y > 0) ||
        throw(ArgumentError("x and y must be positive, got x = $x, y = $y"))

    config = launch_configuration(kernel.fun) # get the number of threads

    # x dimension is hardcoded to warp size 32
    threads_x = 32

    # y dimension: rows needed for all work items, capped by the rows that fit
    # into the thread count from the launch configuration
    rows_needed = cld(x * y, threads_x)
    rows_allowed = max(fld(config.threads, threads_x), 1)
    threads_y = min(rows_needed, rows_allowed)

    threads = (threads_x, threads_y)
    blocks = (cld(x, threads_x), cld(y, threads_y))

    return (threads = threads, blocks = blocks)
end

# 3D kernel configurator
#
# Return the launch configuration
# `(threads = (tx, ty, tz), blocks = (bx, by, bz))` for a kernel that is
# launched over an `x` by `y` by `z` index space.
#
# The x dimension of a block is hardcoded to 32 threads (the warp size). The y
# and z dimensions are derived from the thread count returned by
# `launch_configuration` for this kernel:
#   - y is the smaller of the number of 32-wide rows needed to cover `x * y`
#     items and the number of 32-wide rows that fit into that thread count
#     (at least one);
#   - z is the smaller of the number of `32 * ty` slabs needed to cover
#     `x * y * z` items and the number of such slabs that fit into that thread
#     count (at least one).
# The number of blocks per dimension is the extent divided by the block size in
# that dimension, rounded up.
#
# Throws an `ArgumentError` if `x`, `y`, or `z` is not positive. A zero extent
# would otherwise produce a zero-sized block dimension and a `DivideError`.
function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
    (x > 0 && y > 0 && z > 0) ||
        throw(ArgumentError("x, y and z must be positive, got x = $x, y = $y, z = $z"))

    config = launch_configuration(kernel.fun) # get the number of threads

    # x dimension is hardcoded to warp size 32
    threads_x = 32

    # y dimension
    rows_needed = cld(x * y, threads_x)
    rows_allowed = max(fld(config.threads, threads_x), 1)
    threads_y = min(rows_needed, rows_allowed)

    # z dimension, based on the threads already used by one x-y slab
    slab = threads_x * threads_y
    slabs_needed = cld(x * y * z, slab)
    slabs_allowed = max(fld(config.threads, slab), 1)
    threads_z = min(slabs_needed, slabs_allowed)

    threads = (threads_x, threads_y, threads_z)
    blocks = (cld(x, threads_x), cld(y, threads_y), cld(z, threads_z))

    return (threads = threads, blocks = blocks)
end

# Deprecated old kernel configurators below

# function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1})
# config = launch_configuration(kernel.fun)

# threads = min(length(array), config.threads)
# blocks = cld(length(array), threads)

# return (threads = threads, blocks = blocks)
# end

# function configurator_2d(kernel::HostKernel, array::CuArray{<:Any, 2})
# config = launch_configuration(kernel.fun)

# threads = Tuple(fill(Int(floor((min(maximum(size(array)), config.threads))^(1 / 2))), 2))
# blocks = map(cld, size(array), threads)

# return (threads = threads, blocks = blocks)
# end

# function configurator_3d(kernel::HostKernel, array::CuArray{<:Any, 3})
# config = launch_configuration(kernel.fun)

# threads = Tuple(fill(Int(floor((min(maximum(size(array)), config.threads))^(1 / 3))), 3))
# blocks = map(cld, size(array), threads)

# return (threads = threads, blocks = blocks)
# end
Loading

0 comments on commit 471cb7d

Please sign in to comment.