Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combine similar kernels using cooperative groups #97

Merged
merged 3 commits into from
Dec 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 78 additions & 3 deletions src/auxiliary/configurators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# blocks to be used in the kernel, which optimizes the use of GPU resources.

# 1D kernel configurator
# We hardcode 32 threads per block for 1D kernels
# We hardcode 32 threads per block for 1D kernels.
function kernel_configurator_1d(kernel::HostKernel, x::Int)
# config = launch_configuration(kernel.fun) # not used in this case

Expand All @@ -12,9 +12,25 @@ function kernel_configurator_1d(kernel::HostKernel, x::Int)
return (threads = threads, blocks = blocks)
end

# 1D kernel configurator for cooperative launch.
# A cooperative launch may not use more blocks than the device has SMs, so the
# multiprocessor count is queried and used as an upper bound on the grid size.
# Kernels launched this way must use stride loops to cover the full index range
# with this constrained launch size.
function kernel_configurator_coop_1d(kernel::HostKernel, x::Int)
    # config = launch_configuration(kernel.fun) # not used in this case
    # Maybe pack properties into a struct
    dev = CUDA.device()
    num_sms = CUDA.attribute(dev, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs

    # One warp per block; blocks smaller than 32 threads would be padded to 32 anyway.
    warp = 32
    grid = min(cld(x, warp), num_sms)

    return (threads = warp, blocks = grid)
end

# 2D kernel configurator
# We hardcode 32 threads for x dimension per block, and y dimension is determined
# by the number of threads returned by the launch configuration
# by the number of threads returned by the launch configuration.
function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
config = launch_configuration(kernel.fun) # get the number of threads

Expand All @@ -31,9 +47,35 @@ function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
return (threads = threads, blocks = blocks)
end

# 2D kernel configurator for cooperative launch
# Note that cooperative kernels can only launch as many blocks as there are SMs on the device,
# so we need to query the SM count first. Also, kernels launched with cooperative launch have
# to use stride loops to handle the constrained launch size.
function kernel_configurator_coop_2d(kernel::HostKernel, x::Int, y::Int)
    config = launch_configuration(kernel.fun) # get the number of threads
    # Maybe pack properties into a struct
    device = CUDA.device()
    sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs

    # y dimension
    dims_y1 = cld(x * y, 32)
    dims_y2 = max(fld(config.threads, 32), 1)

    dims_y = min(dims_y1, dims_y2)

    # x dimension is hardcoded to warp size 32
    threads = (32, dims_y)
    # Cap `blocks_x` at the SM count so that `fld(sm_count, blocks_x)` below is at
    # least 1. Without the cap, a large `x` makes `blocks_x > sm_count` and
    # `blocks_y` becomes 0 — an invalid launch configuration. Capping is safe
    # because cooperative kernels use stride loops to cover the remaining range.
    blocks_x = min(cld(x, threads[1]), sm_count)
    blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))

    blocks = (blocks_x, blocks_y)

    return (threads = threads, blocks = blocks)
end

# 3D kernel configurator
# We hardcode 32 threads for x dimension per block, y and z dimensions are determined
# by the number of threads returned by the launch configuration
# by the number of threads returned by the launch configuration.
function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
config = launch_configuration(kernel.fun) # get the number of threads

Expand All @@ -56,6 +98,39 @@ function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
return (threads = threads, blocks = blocks)
end

# 3D kernel configurator for cooperative launch
# Note that cooperative kernels can only launch as many blocks as there are SMs on the device,
# so we need to query the SM count first. Also, kernels launched with cooperative launch have
# to use stride loops to handle the constrained launch size.
function kernel_configurator_coop_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
    config = launch_configuration(kernel.fun) # get the number of threads
    # Maybe pack properties into a struct
    device = CUDA.device()
    sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs

    # y dimension
    dims_y1 = cld(x * y, 32)
    dims_y2 = max(fld(config.threads, 32), 1)

    dims_y = min(dims_y1, dims_y2)

    # z dimension
    dims_z1 = cld(x * y * z, 32 * dims_y)
    dims_z2 = max(fld(config.threads, 32 * dims_y), 1)

    dims_z = min(dims_z1, dims_z2)

    # x dimension is hardcoded to warp size 32
    threads = (32, dims_y, dims_z)
    # Cap `blocks_x` at the SM count so the `fld` divisions below never return 0.
    # Without the cap, a large `x` makes `blocks_x > sm_count`, so `blocks_y`
    # (and in turn `blocks_z`) would be 0 — an invalid launch configuration.
    # Since `blocks_y <= fld(sm_count, blocks_x)` implies
    # `blocks_x * blocks_y <= sm_count`, the z-dimension divisor is also >= 1.
    # Capping is safe because cooperative kernels use stride loops to cover the
    # remaining range.
    blocks_x = min(cld(x, threads[1]), sm_count)
    blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))
    blocks_z = min(cld(z, threads[3]), fld(sm_count, blocks_x * blocks_y))

    blocks = (blocks_x, blocks_y, blocks_z)

    return (threads = threads, blocks = blocks)
end

# Deprecated old kernel configurators below

# function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1})
Expand Down
12 changes: 5 additions & 7 deletions src/solvers/dg_2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -902,7 +902,7 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_value
fstar_primary_upper, fstar_primary_lower,
fstar_secondary_upper, fstar_secondary_lower,
reverse_upper, reverse_lower, neighbor_ids, large_sides,
orientations, equations::AbstractEquations{2})
orientations)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
Expand Down Expand Up @@ -1619,12 +1619,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
reverse_lower,
neighbor_ids,
large_sides,
orientations,
equations)
orientations)
mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
reverse_upper, reverse_lower, neighbor_ids, large_sides,
orientations, equations;
orientations;
kernel_configurator_3d(mortar_flux_copy_to_kernel,
size(surface_flux_values, 1),
size(surface_flux_values, 2),
Expand Down Expand Up @@ -1679,12 +1678,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
reverse_lower,
neighbor_ids,
large_sides,
orientations,
equations)
orientations)
mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
reverse_upper, reverse_lower, neighbor_ids, large_sides,
orientations, equations;
orientations;
kernel_configurator_3d(mortar_flux_copy_to_kernel,
size(surface_flux_values, 1),
size(surface_flux_values, 2),
Expand Down
Loading
Loading