From ccb6d678b3896f1a464e781642186ab1d0818f80 Mon Sep 17 00:00:00 2001
From: Huiyu Xie <huiyuxie.sde@gmail.com>
Date: Fri, 20 Dec 2024 22:35:27 -1000
Subject: [PATCH] Combine similar kernels using cooperative groups (#97)

* Cooperative kernels

* Complete

* Remove benchmark
---
 src/auxiliary/configurators.jl |   81 ++-
 src/solvers/dg_2d.jl           |   12 +-
 src/solvers/dg_3d.jl           | 1081 +++++++++++++++++++++++---------
 3 files changed, 857 insertions(+), 317 deletions(-)

diff --git a/src/auxiliary/configurators.jl b/src/auxiliary/configurators.jl
index c2e04a7..813576a 100644
--- a/src/auxiliary/configurators.jl
+++ b/src/auxiliary/configurators.jl
@@ -2,7 +2,7 @@
 # blocks to be used in the kernel, which optimizes the use of GPU resources.
 
 # 1D kernel configurator
-# We hardcode 32 threads per block for 1D kernels
+# We hardcode 32 threads per block for 1D kernels.
 function kernel_configurator_1d(kernel::HostKernel, x::Int)
     # config = launch_configuration(kernel.fun) # not used in this case
 
@@ -12,9 +12,25 @@ function kernel_configurator_1d(kernel::HostKernel, x::Int)
     return (threads = threads, blocks = blocks)
 end
 
+# 1D kernel configurator for cooperative launch
+# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, 
+# so we need to query the SM count first. Also, kernels launched with cooperative launch have 
+# to use stride loops to handle the constrained launch size.
+function kernel_configurator_coop_1d(kernel::HostKernel, x::Int)
+    # config = launch_configuration(kernel.fun) # not used in this case
+    # Maybe pack properties into a struct
+    device = CUDA.device()
+    sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs
+
+    threads = 32 # warp size is 32, if block size is less than 32, it will be padded to 32
+    blocks = min(cld(x, threads), sm_count)
+
+    return (threads = threads, blocks = blocks)
+end
+
 # 2D kernel configurator
 # We hardcode 32 threads for x dimension per block, and y dimension is determined 
-# by the number of threads returned by the launch configuration
+# by the number of threads returned by the launch configuration.
 function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
     config = launch_configuration(kernel.fun) # get the number of threads
 
@@ -31,9 +47,35 @@ function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
     return (threads = threads, blocks = blocks)
 end
 
+# 2D kernel configurator for cooperative launch
+# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, 
+# so we need to query the SM count first. Also, kernels launched with cooperative launch have 
+# to use stride loops to handle the constrained launch size.
+function kernel_configurator_coop_2d(kernel::HostKernel, x::Int, y::Int)
+    config = launch_configuration(kernel.fun) # get the number of threads
+    # Maybe pack properties into a struct
+    device = CUDA.device()
+    sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs
+
+    # y dimension
+    dims_y1 = cld(x * y, 32)
+    dims_y2 = max(fld(config.threads, 32), 1)
+
+    dims_y = min(dims_y1, dims_y2)
+
+    # x dimension is hardcoded to warp size 32
+    threads = (32, dims_y)
+    blocks_x = cld(x, threads[1])
+    blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))
+
+    blocks = (blocks_x, blocks_y)
+
+    return (threads = threads, blocks = blocks)
+end
+
 # 3D kernel configurator
 # We hardcode 32 threads for x dimension per block, y and z dimensions are determined 
-# by the number of threads returned by the launch configuration
+# by the number of threads returned by the launch configuration.
 function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
     config = launch_configuration(kernel.fun) # get the number of threads
 
@@ -56,6 +98,39 @@ function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
     return (threads = threads, blocks = blocks)
 end
 
+# 3D kernel configurator for cooperative launch
+# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, 
+# so we need to query the SM count first. Also, kernels launched with cooperative launch have 
+# to use stride loops to handle the constrained launch size.
+function kernel_configurator_coop_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
+    config = launch_configuration(kernel.fun) # get the number of threads
+    # Maybe pack properties into a struct
+    device = CUDA.device()
+    sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs
+
+    # y dimension
+    dims_y1 = cld(x * y, 32)
+    dims_y2 = max(fld(config.threads, 32), 1)
+
+    dims_y = min(dims_y1, dims_y2)
+
+    # z dimension
+    dims_z1 = cld(x * y * z, 32 * dims_y)
+    dims_z2 = max(fld(config.threads, 32 * dims_y), 1)
+
+    dims_z = min(dims_z1, dims_z2)
+
+    # x dimension is hardcoded to warp size 32
+    threads = (32, dims_y, dims_z)
+    blocks_x = cld(x, threads[1])
+    blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))
+    blocks_z = min(cld(z, threads[3]), fld(sm_count, blocks_x * blocks_y))
+
+    blocks = (blocks_x, blocks_y, blocks_z)
+
+    return (threads = threads, blocks = blocks)
+end
+
 # Deprecated old kernel configurators below
 
 # function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1})
diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl
index de0643e..e7cae9f 100644
--- a/src/solvers/dg_2d.jl
+++ b/src/solvers/dg_2d.jl
@@ -902,7 +902,7 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_value
                                      fstar_primary_upper, fstar_primary_lower,
                                      fstar_secondary_upper, fstar_secondary_lower,
                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                                     orientations, equations::AbstractEquations{2})
+                                     orientations)
     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
@@ -1619,12 +1619,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
                                                                                 reverse_lower,
                                                                                 neighbor_ids,
                                                                                 large_sides,
-                                                                                orientations,
-                                                                                equations)
+                                                                                orientations)
     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
                                fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations, equations;
+                               orientations;
                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
                                                       size(surface_flux_values, 1),
                                                       size(surface_flux_values, 2),
@@ -1679,12 +1678,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
                                                                                 reverse_lower,
                                                                                 neighbor_ids,
                                                                                 large_sides,
-                                                                                orientations,
-                                                                                equations)
+                                                                                orientations)
     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
                                fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations, equations;
+                               orientations;
                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
                                                       size(surface_flux_values, 1),
                                                       size(surface_flux_values, 2),
diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl
index e14df20..8bdf846 100644
--- a/src/solvers/dg_3d.jl
+++ b/src/solvers/dg_3d.jl
@@ -890,126 +890,248 @@ function prolong_mortars_small2small_kernel!(u_upper_left, u_upper_right, u_lowe
     return nothing
 end
 
-# Kernel for interpolating data large to small on mortars - step 1
-function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                                             tmp_lower_right, u, forward_upper,
-                                             forward_lower, neighbor_ids, large_sides, orientations)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
-
-    if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 &&
-        k <= size(tmp_upper_left, 5))
-        u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2)
-
-        j1 = div(j - 1, u2) + 1
-        j2 = rem(j - 1, u2) + 1
-
-        large_side = large_sides[k]
-        orientation = orientations[k]
-        large_element = neighbor_ids[5, k]
-
-        leftright = large_side
-
-        @inbounds begin
-            for j1j1 in axes(forward_lower, 2)
-                tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
-                                                           u[i,
-                                                             isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                             isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
-                                                             isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
-                                                             large_element] * (2 - large_side)
-
-                tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
-                                                            u[i,
-                                                              isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
-                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
-                                                              large_element] * (2 - large_side)
-
-                tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
-                                                           u[i,
-                                                             isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                             isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
-                                                             isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
-                                                             large_element] * (2 - large_side)
-
-                tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
-                                                            u[i,
-                                                              isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
-                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
-                                                              large_element] * (2 - large_side)
-            end
-
-            for j1j1 in axes(forward_lower, 2)
-                tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
-                                                           u[i,
-                                                             isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                             isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
-                                                             isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
-                                                             large_element] * (large_side - 1)
-
-                tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
-                                                            u[i,
-                                                              isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
-                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
-                                                              large_element] * (large_side - 1)
-
-                tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
-                                                           u[i,
-                                                             isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                             isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
-                                                             isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
-                                                             large_element] * (large_side - 1)
-
-                tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
-                                                            u[i,
-                                                              isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
-                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
-                                                              large_element] * (large_side - 1)
-            end
-        end
-    end
-
-    return nothing
-end
-
-# Kernel for interpolating data large to small on mortars - step 2
-function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left,
-                                             u_lower_right, tmp_upper_left, tmp_upper_right,
-                                             tmp_lower_left, tmp_lower_right, forward_upper,
-                                             forward_lower, large_sides)
+# # Kernel for interpolating data large to small on mortars - step 1
+# function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                              tmp_lower_right, u, forward_upper,
+#                                              forward_lower, neighbor_ids, large_sides, orientations)
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 &&
+#         k <= size(tmp_upper_left, 5))
+#         u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2)
+
+#         j1 = div(j - 1, u2) + 1
+#         j2 = rem(j - 1, u2) + 1
+
+#         large_side = large_sides[k]
+#         orientation = orientations[k]
+#         large_element = neighbor_ids[5, k]
+
+#         leftright = large_side
+
+#         @inbounds begin
+#             for j1j1 in axes(forward_lower, 2)
+#                 tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+#                                                            u[i,
+#                                                              isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+#                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+#                                                              large_element] * (2 - large_side)
+
+#                 tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+#                                                             u[i,
+#                                                               isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                               isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+#                                                               isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+#                                                               large_element] * (2 - large_side)
+
+#                 tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+#                                                            u[i,
+#                                                              isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+#                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+#                                                              large_element] * (2 - large_side)
+
+#                 tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+#                                                             u[i,
+#                                                               isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                               isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+#                                                               isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+#                                                               large_element] * (2 - large_side)
+#             end
+
+#             for j1j1 in axes(forward_lower, 2)
+#                 tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+#                                                            u[i,
+#                                                              isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+#                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
+#                                                              large_element] * (large_side - 1)
+
+#                 tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+#                                                             u[i,
+#                                                               isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                               isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+#                                                               isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
+#                                                               large_element] * (large_side - 1)
+
+#                 tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+#                                                            u[i,
+#                                                              isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+#                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
+#                                                              large_element] * (large_side - 1)
+
+#                 tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+#                                                             u[i,
+#                                                               isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                               isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+#                                                               isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
+#                                                               large_element] * (large_side - 1)
+#             end
+#         end
+#     end
+
+#     return nothing
+# end
+
+# # Kernel for interpolating data large to small on mortars - step 2
+# function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left,
+#                                              u_lower_right, tmp_upper_left, tmp_upper_right,
+#                                              tmp_lower_left, tmp_lower_right, forward_upper,
+#                                              forward_lower, large_sides)
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 &&
+#         k <= size(u_upper_left, 5))
+#         u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2)
+
+#         j1 = div(j - 1, u2) + 1
+#         j2 = rem(j - 1, u2) + 1
+
+#         leftright = large_sides[k]
+
+#         @inbounds begin
+#             for j2j2 in axes(forward_upper, 2)
+#                 u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
+#                                                          tmp_upper_left[leftright, i, j1, j2j2, k]
+
+#                 u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
+#                                                           tmp_upper_right[leftright, i, j1, j2j2, k]
+
+#                 u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
+#                                                          tmp_lower_left[leftright, i, j1, j2j2, k]
+
+#                 u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
+#                                                           tmp_lower_right[leftright, i, j1, j2j2, k]
+#             end
+#         end
+#     end
+
+#     return nothing
+# end
+
+# Kernel for interpolating data large to small on mortars
+function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, u_lower_right,
+                                             tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right,
+                                             u, forward_upper, forward_lower, neighbor_ids, large_sides,
+                                             orientations)
     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
-    if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 &&
-        k <= size(u_upper_left, 5))
-        u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2)
-
-        j1 = div(j - 1, u2) + 1
-        j2 = rem(j - 1, u2) + 1
-
-        leftright = large_sides[k]
-
-        @inbounds begin
-            for j2j2 in axes(forward_upper, 2)
-                u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
-                                                         tmp_upper_left[leftright, i, j1, j2j2, k]
-
-                u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
-                                                          tmp_upper_right[leftright, i, j1, j2j2, k]
-
-                u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
-                                                         tmp_lower_left[leftright, i, j1, j2j2, k]
-
-                u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
-                                                          tmp_lower_right[leftright, i, j1, j2j2, k]
+    # Loop stride for each dimension
+    stride_x = gridDim().x * blockDim().x
+    stride_y = gridDim().y * blockDim().y
+    stride_z = gridDim().z * blockDim().z
+
+    # Cooperative kernel needs stride loops to handle the constrained launch size
+    while i <= size(tmp_upper_left, 2)
+        while j <= size(tmp_upper_left, 3)^2
+            while k <= size(tmp_upper_left, 5)
+                u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2)
+
+                j1 = div(j - 1, u2) + 1
+                j2 = rem(j - 1, u2) + 1
+
+                large_side = large_sides[k]
+                orientation = orientations[k]
+                large_element = neighbor_ids[5, k]
+
+                leftright = large_side
+
+                @inbounds begin
+                    for j1j1 in axes(forward_lower, 2)
+                        tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+                                                                   u[i,
+                                                                     isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                     isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+                                                                     isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+                                                                     large_element] * (2 - large_side)
+
+                        tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+                                                                    u[i,
+                                                                      isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                      isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+                                                                      isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+                                                                      large_element] * (2 - large_side)
+
+                        tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+                                                                   u[i,
+                                                                     isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                     isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+                                                                     isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+                                                                     large_element] * (2 - large_side)
+
+                        tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+                                                                    u[i,
+                                                                      isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                      isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+                                                                      isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+                                                                      large_element] * (2 - large_side)
+                    end
+
+                    for j1j1 in axes(forward_lower, 2)
+                        tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+                                                                   u[i,
+                                                                     isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                     isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+                                                                     isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation,
+                                                                                                                                           3),
+                                                                     large_element] * (large_side - 1)
+
+                        tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+                                                                    u[i,
+                                                                      isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                      isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+                                                                      isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation,
+                                                                                                                                            3),
+                                                                      large_element] * (large_side - 1)
+
+                        tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+                                                                   u[i,
+                                                                     isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                     isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+                                                                     isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation,
+                                                                                                                                           3),
+                                                                     large_element] * (large_side - 1)
+
+                        tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+                                                                    u[i,
+                                                                      isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                      isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+                                                                      isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation,
+                                                                                                                                            3),
+                                                                      large_element] * (large_side - 1)
+                    end
+
+                    # Grid scope synchronization
+                    grid = CG.this_grid()
+                    CG.sync(grid)
+
+                    for j2j2 in axes(forward_upper, 2)
+                        u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
+                                                                 tmp_upper_left[leftright, i, j1, j2j2, k]
+
+                        u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
+                                                                  tmp_upper_right[leftright, i, j1, j2j2, k]
+
+                        u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
+                                                                 tmp_lower_left[leftright, i, j1, j2j2, k]
+
+                        u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
+                                                                  tmp_lower_right[leftright, i, j1, j2j2, k]
+                    end
+                end
+                k += stride_z
             end
+            j += stride_y
         end
+        i += stride_x
     end
 
     return nothing
@@ -1150,8 +1272,128 @@ function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right
 end
 
 # # Kernel for copying mortar fluxes small to small and small to large - step 1
-function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right,
-                                     tmp_lower_left, tmp_lower_right,
+# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right,
+#                                      tmp_lower_left, tmp_lower_right,
+#                                      fstar_primary_upper_left, fstar_primary_upper_right,
+#                                      fstar_primary_lower_left, fstar_primary_lower_right,
+#                                      fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                      fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                      orientations)
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
+#         k <= length(orientations))
+#         j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+#         j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+#         lower_left_element = neighbor_ids[1, k]
+#         lower_right_element = neighbor_ids[2, k]
+#         upper_left_element = neighbor_ids[3, k]
+#         upper_right_element = neighbor_ids[4, k]
+#         large_element = neighbor_ids[5, k]
+
+#         large_side = large_sides[k]
+#         orientation = orientations[k]
+
+#         # Use simple math expression to enhance the performance (against control flow), 
+#         # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
+#         # Please also check the original code in Trixi.jl when you modify this code.
+#         direction = 2 * orientation + large_side - 2
+
+#         surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
+
+#         # Use simple math expression to enhance the performance (against control flow), 
+#         # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
+#         # Please also check the original code in Trixi.jl when you modify this code.
+#         direction = 2 * orientation - large_side + 1
+
+#         @inbounds begin
+#             for j1j1 in axes(reverse_upper, 2)
+#                 tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+#                                                                        fstar_secondary_upper_left[i, j1j1, j2, k]
+#                 tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+#                                                                         fstar_secondary_upper_right[i, j1j1, j2, k]
+#                 tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+#                                                                        fstar_secondary_lower_left[i, j1j1, j2, k]
+#                 tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+#                                                                         fstar_secondary_lower_right[i, j1j1, j2, k]
+#             end
+#         end
+#     end
+
+#     return nothing
+# end
+
+# # Kernel for copying mortar fluxes small to small and small to large - step 2
+# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                      tmp_upper_right, tmp_lower_left, tmp_lower_right,
+#                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                      orientations, equations::AbstractEquations{3})
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
+#         k <= length(orientations))
+#         j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+#         j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+#         large_element = neighbor_ids[5, k]
+
+#         large_side = large_sides[k]
+#         orientation = orientations[k]
+
+#         # See step 1 for the explanation of the following expression
+#         direction = 2 * orientation - large_side + 1
+
+#         @inbounds begin
+#             for j2j2 in axes(reverse_lower, 2)
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+#                                                                                 tmp_upper_left[i, j1, j2j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+#                                                                                 tmp_upper_right[i, j1, j2j2,
+#                                                                                                 direction,
+#                                                                                                 large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+#                                                                                 tmp_lower_left[i, j1, j2j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+#                                                                                 tmp_lower_right[i, j1, j2j2,
+#                                                                                                 direction,
+#                                                                                                 large_element]
+#             end
+
+#             surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#         end
+#     end
+
+#     return nothing
+# end
+
+# Kernel for copying mortar fluxes small to small and small to large
+function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values,
+                                     tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right,
                                      fstar_primary_upper_left, fstar_primary_upper_right,
                                      fstar_primary_lower_left, fstar_primary_lower_right,
                                      fstar_secondary_upper_left, fstar_secondary_upper_right,
@@ -1162,108 +1404,98 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_up
     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
-    if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
-        k <= length(orientations))
-        j1 = div(j - 1, size(surface_flux_values, 2)) + 1
-        j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
-
-        lower_left_element = neighbor_ids[1, k]
-        lower_right_element = neighbor_ids[2, k]
-        upper_left_element = neighbor_ids[3, k]
-        upper_right_element = neighbor_ids[4, k]
-        large_element = neighbor_ids[5, k]
-
-        large_side = large_sides[k]
-        orientation = orientations[k]
-
-        # Use simple math expression to enhance the performance (against control flow), 
-        # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
-        #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
-        #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
-        #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
-        #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
-        #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
-        # Please also check the original code in Trixi.jl when you modify this code.
-        direction = 2 * orientation + large_side - 2
-
-        surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
-
-        # Use simple math expression to enhance the performance (against control flow), 
-        # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
-        #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
-        #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
-        #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
-        #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
-        #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
-        # Please also check the original code in Trixi.jl when you modify this code.
-        direction = 2 * orientation - large_side + 1
-
-        @inbounds begin
-            for j1j1 in axes(reverse_upper, 2)
-                tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
-                                                                       fstar_secondary_upper_left[i, j1j1, j2, k]
-                tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
-                                                                        fstar_secondary_upper_right[i, j1j1, j2, k]
-                tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
-                                                                       fstar_secondary_lower_left[i, j1j1, j2, k]
-                tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
-                                                                        fstar_secondary_lower_right[i, j1j1, j2, k]
-            end
-        end
-    end
-
-    return nothing
-end
-
-# Kernel for copying mortar fluxes small to small and small to large - step 2
-function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                                     tmp_upper_right, tmp_lower_left, tmp_lower_right,
-                                     reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                                     orientations, equations::AbstractEquations{3})
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
-
-    if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
-        k <= length(orientations))
-        j1 = div(j - 1, size(surface_flux_values, 2)) + 1
-        j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
-
-        large_element = neighbor_ids[5, k]
-
-        large_side = large_sides[k]
-        orientation = orientations[k]
-
-        # See step 1 for the explanation of the following expression
-        direction = 2 * orientation - large_side + 1
+    # Loop stride for each dimension
+    stride_x = gridDim().x * blockDim().x
+    stride_y = gridDim().y * blockDim().y
+    stride_z = gridDim().z * blockDim().z
+
+    # Cooperative kernel needs stride loops to handle the constrained launch size
+    while i <= size(surface_flux_values, 1)
+        while j <= size(surface_flux_values, 2)^2
+            while k <= length(orientations)
+                j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+                j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+                lower_left_element = neighbor_ids[1, k]
+                lower_right_element = neighbor_ids[2, k]
+                upper_left_element = neighbor_ids[3, k]
+                upper_right_element = neighbor_ids[4, k]
+                large_element = neighbor_ids[5, k]
+
+                large_side = large_sides[k]
+                orientation = orientations[k]
+
+                # Use simple math expression to enhance the performance (against control flow), 
+                # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
+                #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
+                #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
+                #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
+                #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
+                #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
+                # Please also check the original code in Trixi.jl when you modify this code.
+                direction = 2 * orientation + large_side - 2
+
+                surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
+
+                # Use simple math expression to enhance the performance (against control flow), 
+                # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
+                #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
+                #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
+                #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
+                #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
+                #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
+                # Please also check the original code in Trixi.jl when you modify this code.
+                direction = 2 * orientation - large_side + 1
+
+                @inbounds begin
+                    for j1j1 in axes(reverse_upper, 2)
+                        tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+                                                                               fstar_secondary_upper_left[i, j1j1, j2, k]
+                        tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+                                                                                fstar_secondary_upper_right[i, j1j1, j2, k]
+                        tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+                                                                               fstar_secondary_lower_left[i, j1j1, j2, k]
+                        tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+                                                                                fstar_secondary_lower_right[i, j1j1, j2, k]
+                    end
+                end
 
-        @inbounds begin
-            for j2j2 in axes(reverse_lower, 2)
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
-                                                                                tmp_upper_left[i, j1, j2j2,
-                                                                                               direction,
-                                                                                               large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
-                                                                                tmp_upper_right[i, j1, j2j2,
-                                                                                                direction,
-                                                                                                large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
-                                                                                tmp_lower_left[i, j1, j2j2,
-                                                                                               direction,
-                                                                                               large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
-                                                                                tmp_lower_right[i, j1, j2j2,
-                                                                                                direction,
-                                                                                                large_element]
+                # Grid scope synchronization
+                grid = CG.this_grid()
+                CG.sync(grid)
+
+                @inbounds begin
+                    for j2j2 in axes(reverse_lower, 2)
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+                                                                                        tmp_upper_left[i, j1, j2j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+                                                                                        tmp_upper_right[i, j1, j2j2,
+                                                                                                        direction,
+                                                                                                        large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+                                                                                        tmp_lower_left[i, j1, j2j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+                                                                                        tmp_lower_right[i, j1, j2j2,
+                                                                                                        direction,
+                                                                                                        large_element]
+                    end
+
+                    surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                end
+                k += stride_z
             end
-
-            surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
-                                                                                               direction,
-                                                                                               large_element]
+            j += stride_y
         end
+        i += stride_x
     end
 
     return nothing
@@ -1857,7 +2089,81 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::D
     @assert iszero(length(cache.mortars.orientations))
 end
 
-# Pack kernels for prolonging solution to mortars
+# # Pack kernels for prolonging solution to mortars
+# function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache)
+#     neighbor_ids = cache.mortars.neighbor_ids
+#     large_sides = cache.mortars.large_sides
+#     orientations = cache.mortars.orientations
+
+#     # The original CPU arrays hold NaNs
+#     u_upper_left = cache.mortars.u_upper_left
+#     u_upper_right = cache.mortars.u_upper_right
+#     u_lower_left = cache.mortars.u_lower_left
+#     u_lower_right = cache.mortars.u_lower_right
+#     forward_upper = CuArray(dg.mortar.forward_upper)
+#     forward_lower = CuArray(dg.mortar.forward_lower)
+
+#     prolong_mortars_small2small_kernel = @cuda launch=false prolong_mortars_small2small_kernel!(u_upper_left,
+#                                                                                                 u_upper_right,
+#                                                                                                 u_lower_left,
+#                                                                                                 u_lower_right,
+#                                                                                                 u,
+#                                                                                                 neighbor_ids,
+#                                                                                                 large_sides,
+#                                                                                                 orientations)
+#     prolong_mortars_small2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, u,
+#                                        neighbor_ids, large_sides, orientations;
+#                                        kernel_configurator_3d(prolong_mortars_small2small_kernel,
+#                                                               size(u_upper_left, 2),
+#                                                               size(u_upper_left, 3)^2,
+#                                                               size(u_upper_left, 5))...)
+
+#     tmp_upper_left = zero(similar(u_upper_left)) # undef to zero
+#     tmp_upper_right = zero(similar(u_upper_right)) # undef to zero
+#     tmp_lower_left = zero(similar(u_lower_left)) # undef to zero
+#     tmp_lower_right = zero(similar(u_lower_right)) # undef to zero
+
+#     prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(tmp_upper_left,
+#                                                                                                 tmp_upper_right,
+#                                                                                                 tmp_lower_left,
+#                                                                                                 tmp_lower_right,
+#                                                                                                 u,
+#                                                                                                 forward_upper,
+#                                                                                                 forward_lower,
+#                                                                                                 neighbor_ids,
+#                                                                                                 large_sides,
+#                                                                                                 orientations)
+#     prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                        tmp_lower_right, u, forward_upper, forward_lower,
+#                                        neighbor_ids, large_sides, orientations;
+#                                        kernel_configurator_3d(prolong_mortars_large2small_kernel,
+#                                                               size(u_upper_left, 2),
+#                                                               size(u_upper_left, 3)^2,
+#                                                               size(u_upper_left, 5))...)
+
+#     prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left,
+#                                                                                                 u_upper_right,
+#                                                                                                 u_lower_left,
+#                                                                                                 u_lower_right,
+#                                                                                                 tmp_upper_left,
+#                                                                                                 tmp_upper_right,
+#                                                                                                 tmp_lower_left,
+#                                                                                                 tmp_lower_right,
+#                                                                                                 forward_upper,
+#                                                                                                 forward_lower,
+#                                                                                                 large_sides)
+#     prolong_mortars_large2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right,
+#                                        tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                        tmp_lower_right, forward_upper, forward_lower, large_sides;
+#                                        kernel_configurator_3d(prolong_mortars_large2small_kernel,
+#                                                               size(u_upper_left, 2),
+#                                                               size(u_upper_left, 3)^2,
+#                                                               size(u_upper_left, 5))...)
+
+#     return nothing
+# end
+
+# Pack kernels for prolonging solution to mortars (optimized)
 function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache)
     neighbor_ids = cache.mortars.neighbor_ids
     large_sides = cache.mortars.large_sides
@@ -1891,25 +2197,6 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG
     tmp_lower_left = zero(similar(u_lower_left)) # undef to zero
     tmp_lower_right = zero(similar(u_lower_right)) # undef to zero
 
-    # TODO: Combine these two kernels into one (synchronization)
-    prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(tmp_upper_left,
-                                                                                                tmp_upper_right,
-                                                                                                tmp_lower_left,
-                                                                                                tmp_lower_right,
-                                                                                                u,
-                                                                                                forward_upper,
-                                                                                                forward_lower,
-                                                                                                neighbor_ids,
-                                                                                                large_sides,
-                                                                                                orientations)
-    prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                                       tmp_lower_right, u, forward_upper, forward_lower,
-                                       neighbor_ids, large_sides, orientations;
-                                       kernel_configurator_3d(prolong_mortars_large2small_kernel,
-                                                              size(u_upper_left, 2),
-                                                              size(u_upper_left, 3)^2,
-                                                              size(u_upper_left, 5))...)
-
     prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left,
                                                                                                 u_upper_right,
                                                                                                 u_lower_left,
@@ -1918,16 +2205,19 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG
                                                                                                 tmp_upper_right,
                                                                                                 tmp_lower_left,
                                                                                                 tmp_lower_right,
-                                                                                                forward_upper,
+                                                                                                u, forward_upper,
                                                                                                 forward_lower,
-                                                                                                large_sides)
+                                                                                                neighbor_ids,
+                                                                                                large_sides,
+                                                                                                orientations)
     prolong_mortars_large2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right,
                                        tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                                       tmp_lower_right, forward_upper, forward_lower, large_sides;
-                                       kernel_configurator_3d(prolong_mortars_large2small_kernel,
-                                                              size(u_upper_left, 2),
-                                                              size(u_upper_left, 3)^2,
-                                                              size(u_upper_left, 5))...)
+                                       tmp_lower_right, u, forward_upper, forward_lower, neighbor_ids,
+                                       large_sides, orientations; cooperative = true,
+                                       kernel_configurator_coop_3d(prolong_mortars_large2small_kernel,
+                                                                   size(u_upper_left, 2),
+                                                                   size(u_upper_left, 3)^2,
+                                                                   size(u_upper_left, 5))...)
 
     return nothing
 end
@@ -1938,7 +2228,116 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::False, nonconservat
     @assert iszero(length(cache.mortars.orientations))
 end
 
-# Pack kernels for calculating mortar fluxes
+# # Pack kernels for calculating mortar fluxes
+# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False,
+#                            equations, dg::DGSEM, cache)
+#     surface_flux = dg.surface_integral.surface_flux
+
+#     neighbor_ids = cache.mortars.neighbor_ids
+#     large_sides = cache.mortars.large_sides
+#     orientations = cache.mortars.orientations
+
+#     # The original CPU arrays hold NaNs
+#     u_upper_left = cache.mortars.u_upper_left
+#     u_upper_right = cache.mortars.u_upper_right
+#     u_lower_left = cache.mortars.u_lower_left
+#     u_lower_right = cache.mortars.u_lower_right
+#     reverse_upper = CuArray(dg.mortar.reverse_upper)
+#     reverse_lower = CuArray(dg.mortar.reverse_lower)
+
+#     surface_flux_values = cache.elements.surface_flux_values
+#     tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero
+
+#     fstar_primary_upper_left = cache.fstar_primary_upper_left
+#     fstar_primary_upper_right = cache.fstar_primary_upper_right
+#     fstar_primary_lower_left = cache.fstar_primary_lower_left
+#     fstar_primary_lower_right = cache.fstar_primary_lower_right
+#     fstar_secondary_upper_left = cache.fstar_secondary_upper_left
+#     fstar_secondary_upper_right = cache.fstar_secondary_upper_right
+#     fstar_secondary_lower_left = cache.fstar_secondary_lower_left
+#     fstar_secondary_lower_right = cache.fstar_secondary_lower_right
+
+#     mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left,
+#                                                                 fstar_primary_upper_right,
+#                                                                 fstar_primary_lower_left,
+#                                                                 fstar_primary_lower_right,
+#                                                                 fstar_secondary_upper_left,
+#                                                                 fstar_secondary_upper_right,
+#                                                                 fstar_secondary_lower_left,
+#                                                                 fstar_secondary_lower_right,
+#                                                                 u_upper_left, u_upper_right,
+#                                                                 u_lower_left, u_lower_right,
+#                                                                 orientations, equations,
+#                                                                 surface_flux)
+#     mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right,
+#                        fstar_primary_lower_left, fstar_primary_lower_right,
+#                        fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                        fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                        u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations,
+#                        equations, surface_flux;
+#                        kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3),
+#                                               size(u_upper_left, 4),
+#                                               length(orientations))...)
+
+#     tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 fstar_primary_upper_left,
+#                                                                                 fstar_primary_upper_right,
+#                                                                                 fstar_primary_lower_left,
+#                                                                                 fstar_primary_lower_right,
+#                                                                                 fstar_secondary_upper_left,
+#                                                                                 fstar_secondary_upper_right,
+#                                                                                 fstar_secondary_lower_left,
+#                                                                                 fstar_secondary_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+#                                fstar_primary_lower_left, fstar_primary_lower_right,
+#                                fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                orientations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations,
+#                                                                                 equations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
+#                                reverse_lower, neighbor_ids, large_sides, orientations, equations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     return nothing
+# end
+
+# Pack kernels for calculating mortar fluxes (optimized)
 function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False,
                            equations, dg::DGSEM, cache)
     surface_flux = dg.surface_integral.surface_flux
@@ -1994,8 +2393,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
 
-    # TODO: Combine these two copy kernels into one (synchronization)
     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+                                                                                tmp_surface_flux_values,
                                                                                 tmp_upper_left,
                                                                                 tmp_upper_right,
                                                                                 tmp_lower_left,
@@ -2013,41 +2412,130 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
                                                                                 neighbor_ids,
                                                                                 large_sides,
                                                                                 orientations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                               tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
-                               fstar_primary_lower_left, fstar_primary_lower_right,
-                               fstar_secondary_upper_left, fstar_secondary_upper_right,
-                               fstar_secondary_lower_left, fstar_secondary_lower_right,
-                               reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
-
-    mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
-                                                                                tmp_surface_flux_values,
-                                                                                tmp_upper_left,
-                                                                                tmp_upper_right,
-                                                                                tmp_lower_left,
-                                                                                tmp_lower_right,
-                                                                                reverse_upper,
-                                                                                reverse_lower,
-                                                                                neighbor_ids,
-                                                                                large_sides,
-                                                                                orientations,
-                                                                                equations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                               tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
-                               reverse_lower, neighbor_ids, large_sides, orientations, equations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
+    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right,
+                               tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+                               fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left,
+                               fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right,
+                               reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations;
+                               cooperative = true,
+                               kernel_configurator_coop_3d(mortar_flux_copy_to_kernel,
+                                                           size(surface_flux_values, 1),
+                                                           size(surface_flux_values, 2)^2,
+                                                           length(orientations))...)
 
     return nothing
 end
 
+# # Pack kernels for calculating mortar fluxes
+# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True,
+#                            equations, dg::DGSEM, cache)
+#     surface_flux, nonconservative_flux = dg.surface_integral.surface_flux
+
+#     neighbor_ids = cache.mortars.neighbor_ids
+#     large_sides = cache.mortars.large_sides
+#     orientations = cache.mortars.orientations
+
+#     # The original CPU arrays hold NaNs
+#     u_upper_left = cache.mortars.u_upper_left
+#     u_upper_right = cache.mortars.u_upper_right
+#     u_lower_left = cache.mortars.u_lower_left
+#     u_lower_right = cache.mortars.u_lower_right
+#     reverse_upper = CuArray(dg.mortar.reverse_upper)
+#     reverse_lower = CuArray(dg.mortar.reverse_lower)
+
+#     surface_flux_values = cache.elements.surface_flux_values
+#     tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero
+
+#     fstar_primary_upper_left = cache.fstar_primary_upper_left
+#     fstar_primary_upper_right = cache.fstar_primary_upper_right
+#     fstar_primary_lower_left = cache.fstar_primary_lower_left
+#     fstar_primary_lower_right = cache.fstar_primary_lower_right
+#     fstar_secondary_upper_left = cache.fstar_secondary_upper_left
+#     fstar_secondary_upper_right = cache.fstar_secondary_upper_right
+#     fstar_secondary_lower_left = cache.fstar_secondary_lower_left
+#     fstar_secondary_lower_right = cache.fstar_secondary_lower_right
+
+#     mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left,
+#                                                                 fstar_primary_upper_right,
+#                                                                 fstar_primary_lower_left,
+#                                                                 fstar_primary_lower_right,
+#                                                                 fstar_secondary_upper_left,
+#                                                                 fstar_secondary_upper_right,
+#                                                                 fstar_secondary_lower_left,
+#                                                                 fstar_secondary_lower_right,
+#                                                                 u_upper_left, u_upper_right,
+#                                                                 u_lower_left, u_lower_right,
+#                                                                 orientations, large_sides,
+#                                                                 equations, surface_flux,
+#                                                                 nonconservative_flux)
+#     mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right,
+#                        fstar_primary_lower_left, fstar_primary_lower_right,
+#                        fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                        fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                        u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations,
+#                        large_sides, equations, surface_flux, nonconservative_flux;
+#                        kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3),
+#                                               size(u_upper_left, 4),
+#                                               length(orientations))...)
+
+#     tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 fstar_primary_upper_left,
+#                                                                                 fstar_primary_upper_right,
+#                                                                                 fstar_primary_lower_left,
+#                                                                                 fstar_primary_lower_right,
+#                                                                                 fstar_secondary_upper_left,
+#                                                                                 fstar_secondary_upper_right,
+#                                                                                 fstar_secondary_lower_left,
+#                                                                                 fstar_secondary_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+#                                fstar_primary_lower_left, fstar_primary_lower_right,
+#                                fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                orientations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations,
+#                                                                                 equations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
+#                                reverse_lower, neighbor_ids, large_sides, orientations, equations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     return nothing
+# end
+
 # Pack kernels for calculating mortar fluxes
 function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True,
                            equations, dg::DGSEM, cache)
@@ -2105,8 +2593,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
 
-    # TODO: Combine these two copy kernels into one (synchronization)
     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+                                                                                tmp_surface_flux_values,
                                                                                 tmp_upper_left,
                                                                                 tmp_upper_right,
                                                                                 tmp_lower_left,
@@ -2124,37 +2612,16 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
                                                                                 neighbor_ids,
                                                                                 large_sides,
                                                                                 orientations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                               tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
-                               fstar_primary_lower_left, fstar_primary_lower_right,
-                               fstar_secondary_upper_left, fstar_secondary_upper_right,
-                               fstar_secondary_lower_left, fstar_secondary_lower_right,
-                               reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
-
-    mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
-                                                                                tmp_surface_flux_values,
-                                                                                tmp_upper_left,
-                                                                                tmp_upper_right,
-                                                                                tmp_lower_left,
-                                                                                tmp_lower_right,
-                                                                                reverse_upper,
-                                                                                reverse_lower,
-                                                                                neighbor_ids,
-                                                                                large_sides,
-                                                                                orientations,
-                                                                                equations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                               tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
-                               reverse_lower, neighbor_ids, large_sides, orientations, equations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
+    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right,
+                               tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+                               fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left,
+                               fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right,
+                               reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations;
+                               cooperative = true,
+                               kernel_configurator_coop_3d(mortar_flux_copy_to_kernel,
+                                                           size(surface_flux_values, 1),
+                                                           size(surface_flux_values, 2)^2,
+                                                           length(orientations))...)
 
     return nothing
 end