Combine similar kernels using cooperative groups (#97)
* Cooperative kernels

* Complete

* Remove benchmark
huiyuxie authored Dec 21, 2024
1 parent e3080cf commit ccb6d67
Showing 3 changed files with 857 additions and 317 deletions.
81 changes: 78 additions & 3 deletions src/auxiliary/configurators.jl
@@ -2,7 +2,7 @@
# blocks to be used in the kernel, which optimizes the use of GPU resources.

# 1D kernel configurator
# We hardcode 32 threads per block for 1D kernels
# We hardcode 32 threads per block for 1D kernels.
function kernel_configurator_1d(kernel::HostKernel, x::Int)
# config = launch_configuration(kernel.fun) # not used in this case

@@ -12,9 +12,25 @@ function kernel_configurator_1d(kernel::HostKernel, x::Int)
return (threads = threads, blocks = blocks)
end

# 1D kernel configurator for cooperative launch
# Note that cooperative kernels require all blocks of the grid to be resident on the device at
# once, so we query the SM count and conservatively cap the launch at one block per SM. Also,
# kernels launched cooperatively have to use stride loops to handle the constrained launch size.
function kernel_configurator_coop_1d(kernel::HostKernel, x::Int)
# config = launch_configuration(kernel.fun) # not used in this case
# Maybe pack properties into a struct
device = CUDA.device()
sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs

threads = 32 # warp size is 32; a block smaller than a warp would be padded up to 32 anyway
blocks = min(cld(x, threads), sm_count)

return (threads = threads, blocks = blocks)
end
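
# Illustrative usage sketch, not part of this change (the kernel name, array sizes, and the
# `using CUDA` import are assumed for a standalone example): a 1D kernel written with a
# grid-stride loop, so the SM-limited cooperative grid still covers the full index range,
# launched through the configurator above.
using CUDA

function scale_kernel!(y, x, a)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x # total number of launched threads
    while i <= length(y) # stride loop: each thread handles every `stride`-th index
        @inbounds y[i] = a * x[i]
        i += stride
    end
    return nothing
end

x = CUDA.rand(Float32, 100_000)
y = similar(x)
kernel = @cuda launch=false scale_kernel!(y, x, 2.0f0)
config = kernel_configurator_coop_1d(kernel, length(y))
kernel(y, x, 2.0f0; cooperative = true, config...)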

# 2D kernel configurator
# We hardcode 32 threads for x dimension per block, and y dimension is determined
# by the number of threads returned by the launch configuration
# by the number of threads returned by the launch configuration.
function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
config = launch_configuration(kernel.fun) # get the number of threads

@@ -31,9 +47,35 @@ function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
return (threads = threads, blocks = blocks)
end

# 2D kernel configurator for cooperative launch
# Note that cooperative kernels require all blocks of the grid to be resident on the device at
# once, so we query the SM count and conservatively cap the launch at one block per SM. Also,
# kernels launched cooperatively have to use stride loops to handle the constrained launch size.
function kernel_configurator_coop_2d(kernel::HostKernel, x::Int, y::Int)
config = launch_configuration(kernel.fun) # get the number of threads
# Maybe pack properties into a struct
device = CUDA.device()
sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs

# y dimension
dims_y1 = cld(x * y, 32)
dims_y2 = max(fld(config.threads, 32), 1)

dims_y = min(dims_y1, dims_y2)

# x dimension is hardcoded to warp size 32
threads = (32, dims_y)
blocks_x = cld(x, threads[1])
blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))

blocks = (blocks_x, blocks_y)

return (threads = threads, blocks = blocks)
end
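
# Worked example with assumed numbers (not part of this change): how the 2D cooperative
# configuration comes out for, say, x = 4 and y = 1200 on a device that reports
# config.threads = 1024 and sm_count = 80.
let x = 4, y = 1200, max_threads = 1024, sm_count = 80
    dims_y = min(cld(x * y, 32), max(fld(max_threads, 32), 1))  # min(150, 32) = 32
    threads = (32, dims_y)                                       # (32, 32), i.e. 1024 threads
    blocks_x = cld(x, threads[1])                                # 1
    blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))  # min(38, 80) = 38
    # 1 * 38 = 38 blocks <= sm_count, so the whole grid can be co-resident; the kernel's
    # stride loops cover any (x, y) indices beyond this constrained launch size.
    (threads = threads, blocks = (blocks_x, blocks_y))
end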

# 3D kernel configurator
# We hardcode 32 threads for x dimension per block, y and z dimensions are determined
# by the number of threads returned by the launch configuration
# by the number of threads returned by the launch configuration.
function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
config = launch_configuration(kernel.fun) # get the number of threads

@@ -56,6 +98,39 @@ function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
return (threads = threads, blocks = blocks)
end

# 3D kernel configurator for cooperative launch
# Note that cooperative kernels require all blocks of the grid to be resident on the device at
# once, so we query the SM count and conservatively cap the launch at one block per SM. Also,
# kernels launched cooperatively have to use stride loops to handle the constrained launch size.
function kernel_configurator_coop_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
config = launch_configuration(kernel.fun) # get the number of threads
# Maybe pack properties into a struct
device = CUDA.device()
sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs

# y dimension
dims_y1 = cld(x * y, 32)
dims_y2 = max(fld(config.threads, 32), 1)

dims_y = min(dims_y1, dims_y2)

# z dimension
dims_z1 = cld(x * y * z, 32 * dims_y)
dims_z2 = max(fld(config.threads, 32 * dims_y), 1)

dims_z = min(dims_z1, dims_z2)

# x dimension is hardcoded to warp size 32
threads = (32, dims_y, dims_z)
blocks_x = cld(x, threads[1])
blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))
blocks_z = min(cld(z, threads[3]), fld(sm_count, blocks_x * blocks_y))

blocks = (blocks_x, blocks_y, blocks_z)

return (threads = threads, blocks = blocks)
end
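
# Sketch of the pattern these configurators enable (not part of this change, and assuming
# CUDA.jl's cooperative-groups API CUDA.CG): two stages that previously ran as separate
# kernels can be combined into one kernel when a grid-wide synchronization separates them,
# which is only legal for cooperatively launched grids.
using CUDA
const CG = CUDA.CG

function fused_stages_kernel!(out, tmp, u)
    grid = CG.this_grid()
    start = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x

    # Stage 1 (formerly its own kernel): fill the intermediate buffer
    i = start
    while i <= length(tmp)
        @inbounds tmp[i] = u[i]^2
        i += stride
    end

    CG.sync(grid) # grid-wide barrier: all of stage 1 is visible before stage 2 reads it

    # Stage 2 (formerly a second kernel): consume the intermediate buffer
    i = start
    while i <= length(out)
        @inbounds out[i] = tmp[i] + u[i]
        i += stride
    end
    return nothing
end

u = CUDA.rand(Float32, 10_000)
tmp = similar(u)
out = similar(u)
kernel = @cuda launch=false fused_stages_kernel!(out, tmp, u)
kernel(out, tmp, u; cooperative = true,
       kernel_configurator_coop_1d(kernel, length(out))...)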

# Deprecated old kernel configurators below

# function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1})
12 changes: 5 additions & 7 deletions src/solvers/dg_2d.jl
@@ -902,7 +902,7 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_value
fstar_primary_upper, fstar_primary_lower,
fstar_secondary_upper, fstar_secondary_lower,
reverse_upper, reverse_lower, neighbor_ids, large_sides,
orientations, equations::AbstractEquations{2})
orientations)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
@@ -1619,12 +1619,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
reverse_lower,
neighbor_ids,
large_sides,
orientations,
equations)
orientations)
mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
reverse_upper, reverse_lower, neighbor_ids, large_sides,
orientations, equations;
orientations;
kernel_configurator_3d(mortar_flux_copy_to_kernel,
size(surface_flux_values, 1),
size(surface_flux_values, 2),
@@ -1679,12 +1678,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
reverse_lower,
neighbor_ids,
large_sides,
orientations,
equations)
orientations)
mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
reverse_upper, reverse_lower, neighbor_ids, large_sides,
orientations, equations;
orientations;
kernel_configurator_3d(mortar_flux_copy_to_kernel,
size(surface_flux_values, 1),
size(surface_flux_values, 2),
