From 34b86b73042e87d82f8bc2584ab04864c16a8b2d Mon Sep 17 00:00:00 2001
From: huiyuxie <huiyuxie.sde@gmail.com>
Date: Fri, 20 Dec 2024 19:03:21 -1000
Subject: [PATCH 1/3] Cooperative kernels

---
 benchmark.jl                   |  83 ++++++
 src/auxiliary/configurators.jl |  81 +++++-
 src/solvers/dg_3d.jl           | 462 +++++++++++++++++++++++----------
 test.jl                        |  48 ++++
 test2.jl                       |  97 +++++++
 5 files changed, 627 insertions(+), 144 deletions(-)
 create mode 100644 benchmark.jl
 create mode 100644 test.jl
 create mode 100644 test2.jl

diff --git a/benchmark.jl b/benchmark.jl
new file mode 100644
index 00000000..b401eca3
--- /dev/null
+++ b/benchmark.jl
@@ -0,0 +1,83 @@
+using Trixi, TrixiCUDA
+using CUDA
+using BenchmarkTools
+
+equations = CompressibleEulerEquations3D(1.4)
+
+initial_condition = initial_condition_convergence_test
+solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)
+
+coordinates_min = (0.0, 0.0, 0.0)
+coordinates_max = (2.0, 2.0, 2.0)
+refinement_patches = ((type = "box", coordinates_min = (0.5, 0.5, 0.5),
+                       coordinates_max = (1.5, 1.5, 1.5)),)
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 2,
+                refinement_patches = refinement_patches,
+                n_cells_max = 10_000)
+
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
+                                    source_terms = source_terms_convergence_test)
+semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver,
+                                           source_terms = source_terms_convergence_test)
+
+tspan = tspan_gpu = (0.0, 1.0)
+t = t_gpu = 0.0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu = copy(ode_gpu.u0)
+du_gpu = similar(u_gpu)
+
+# Semidiscretization process
+# Reset du test
+TrixiCUDA.reset_du!(du_gpu)
+
+# Volume integral test
+TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                Trixi.have_nonconservative_terms(equations_gpu),
+                                equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                cache_gpu)
+
+# Prolong to interfaces test
+TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
+
+# Interface flux test
+TrixiCUDA.cuda_interface_flux!(mesh_gpu,
+                               Trixi.have_nonconservative_terms(equations_gpu),
+                               equations_gpu, solver_gpu, cache_gpu)
+
+# Prolong to boundaries test
+TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
+                                   equations_gpu, cache_gpu)
+
+# Boundary flux test
+TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
+                              Trixi.have_nonconservative_terms(equations_gpu),
+                              equations_gpu, solver_gpu, cache_gpu)
+
+# Prolong to mortars test
+@benchmark CUDA.@sync TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
+                                                      TrixiCUDA.check_cache_mortars(cache_gpu),
+                                                      solver_gpu, cache_gpu)
+
+# # Mortar flux test
+# @benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu),
+#                             Trixi.have_nonconservative_terms(equations_gpu),
+#                             equations_gpu, solver_gpu, cache_gpu)
diff --git a/src/auxiliary/configurators.jl b/src/auxiliary/configurators.jl
index c2e04a75..813576a9 100644
--- a/src/auxiliary/configurators.jl
+++ b/src/auxiliary/configurators.jl
@@ -2,7 +2,7 @@
 # blocks to be used in the kernel, which optimizes the use of GPU resources.
 
 # 1D kernel configurator
-# We hardcode 32 threads per block for 1D kernels
+# We hardcode 32 threads per block for 1D kernels.
 function kernel_configurator_1d(kernel::HostKernel, x::Int)
     # config = launch_configuration(kernel.fun) # not used in this case
 
@@ -12,9 +12,25 @@ function kernel_configurator_1d(kernel::HostKernel, x::Int)
     return (threads = threads, blocks = blocks)
 end
 
+# 1D kernel configurator for cooperative launch
+# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, 
+# so we need to query the SM count first. Also, kernels launched with cooperative launch have 
+# to use stride loops to handle the constrained launch size.
+function kernel_configurator_coop_1d(kernel::HostKernel, x::Int)
+    # config = launch_configuration(kernel.fun) # not used in this case
+    # Maybe pack properties into a struct
+    device = CUDA.device()
+    sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs
+
+    threads = 32 # warp size is 32, if block size is less than 32, it will be padded to 32
+    blocks = min(cld(x, threads), sm_count)
+
+    return (threads = threads, blocks = blocks)
+end
+
 # 2D kernel configurator
 # We hardcode 32 threads for x dimension per block, and y dimension is determined 
-# by the number of threads returned by the launch configuration
+# by the number of threads returned by the launch configuration.
 function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
     config = launch_configuration(kernel.fun) # get the number of threads
 
@@ -31,9 +47,35 @@ function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int)
     return (threads = threads, blocks = blocks)
 end
 
+# 2D kernel configurator for cooperative launch
+# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, 
+# so we need to query the SM count first. Also, kernels launched with cooperative launch have 
+# to use stride loops to handle the constrained launch size.
+function kernel_configurator_coop_2d(kernel::HostKernel, x::Int, y::Int)
+    config = launch_configuration(kernel.fun) # get the number of threads
+    # Maybe pack properties into a struct
+    device = CUDA.device()
+    sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs
+
+    # y dimension
+    dims_y1 = cld(x * y, 32)
+    dims_y2 = max(fld(config.threads, 32), 1)
+
+    dims_y = min(dims_y1, dims_y2)
+
+    # x dimension is hardcoded to warp size 32
+    threads = (32, dims_y)
+    blocks_x = cld(x, threads[1])
+    blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))
+
+    blocks = (blocks_x, blocks_y)
+
+    return (threads = threads, blocks = blocks)
+end
+
 # 3D kernel configurator
 # We hardcode 32 threads for x dimension per block, y and z dimensions are determined 
-# by the number of threads returned by the launch configuration
+# by the number of threads returned by the launch configuration.
 function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
     config = launch_configuration(kernel.fun) # get the number of threads
 
@@ -56,6 +98,39 @@ function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
     return (threads = threads, blocks = blocks)
 end
 
+# 3D kernel configurator for cooperative launch
+# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, 
+# so we need to query the SM count first. Also, kernels launched with cooperative launch have 
+# to use stride loops to handle the constrained launch size.
+function kernel_configurator_coop_3d(kernel::HostKernel, x::Int, y::Int, z::Int)
+    config = launch_configuration(kernel.fun) # get the number of threads
+    # Maybe pack properties into a struct
+    device = CUDA.device()
+    sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs
+
+    # y dimension
+    dims_y1 = cld(x * y, 32)
+    dims_y2 = max(fld(config.threads, 32), 1)
+
+    dims_y = min(dims_y1, dims_y2)
+
+    # z dimension
+    dims_z1 = cld(x * y * z, 32 * dims_y)
+    dims_z2 = max(fld(config.threads, 32 * dims_y), 1)
+
+    dims_z = min(dims_z1, dims_z2)
+
+    # x dimension is hardcoded to warp size 32
+    threads = (32, dims_y, dims_z)
+    blocks_x = cld(x, threads[1])
+    blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x))
+    blocks_z = min(cld(z, threads[3]), fld(sm_count, blocks_x * blocks_y))
+
+    blocks = (blocks_x, blocks_y, blocks_z)
+
+    return (threads = threads, blocks = blocks)
+end
+
 # Deprecated old kernel configurators below
 
 # function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1})
diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl
index e14df20f..54ed7be5 100644
--- a/src/solvers/dg_3d.jl
+++ b/src/solvers/dg_3d.jl
@@ -890,126 +890,248 @@ function prolong_mortars_small2small_kernel!(u_upper_left, u_upper_right, u_lowe
     return nothing
 end
 
-# Kernel for interpolating data large to small on mortars - step 1
-function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                                             tmp_lower_right, u, forward_upper,
-                                             forward_lower, neighbor_ids, large_sides, orientations)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
-
-    if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 &&
-        k <= size(tmp_upper_left, 5))
-        u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2)
-
-        j1 = div(j - 1, u2) + 1
-        j2 = rem(j - 1, u2) + 1
-
-        large_side = large_sides[k]
-        orientation = orientations[k]
-        large_element = neighbor_ids[5, k]
-
-        leftright = large_side
-
-        @inbounds begin
-            for j1j1 in axes(forward_lower, 2)
-                tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
-                                                           u[i,
-                                                             isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                             isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
-                                                             isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
-                                                             large_element] * (2 - large_side)
-
-                tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
-                                                            u[i,
-                                                              isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
-                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
-                                                              large_element] * (2 - large_side)
-
-                tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
-                                                           u[i,
-                                                             isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                             isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
-                                                             isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
-                                                             large_element] * (2 - large_side)
-
-                tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
-                                                            u[i,
-                                                              isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
-                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
-                                                              large_element] * (2 - large_side)
-            end
-
-            for j1j1 in axes(forward_lower, 2)
-                tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
-                                                           u[i,
-                                                             isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                             isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
-                                                             isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
-                                                             large_element] * (large_side - 1)
-
-                tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
-                                                            u[i,
-                                                              isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
-                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
-                                                              large_element] * (large_side - 1)
-
-                tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
-                                                           u[i,
-                                                             isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                             isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
-                                                             isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
-                                                             large_element] * (large_side - 1)
-
-                tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
-                                                            u[i,
-                                                              isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
-                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
-                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
-                                                              large_element] * (large_side - 1)
-            end
-        end
-    end
-
-    return nothing
-end
-
-# Kernel for interpolating data large to small on mortars - step 2
-function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left,
-                                             u_lower_right, tmp_upper_left, tmp_upper_right,
-                                             tmp_lower_left, tmp_lower_right, forward_upper,
-                                             forward_lower, large_sides)
+# # Kernel for interpolating data large to small on mortars - step 1
+# function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                              tmp_lower_right, u, forward_upper,
+#                                              forward_lower, neighbor_ids, large_sides, orientations)
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 &&
+#         k <= size(tmp_upper_left, 5))
+#         u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2)
+
+#         j1 = div(j - 1, u2) + 1
+#         j2 = rem(j - 1, u2) + 1
+
+#         large_side = large_sides[k]
+#         orientation = orientations[k]
+#         large_element = neighbor_ids[5, k]
+
+#         leftright = large_side
+
+#         @inbounds begin
+#             for j1j1 in axes(forward_lower, 2)
+#                 tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+#                                                            u[i,
+#                                                              isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+#                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+#                                                              large_element] * (2 - large_side)
+
+#                 tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+#                                                             u[i,
+#                                                               isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                               isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+#                                                               isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+#                                                               large_element] * (2 - large_side)
+
+#                 tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+#                                                            u[i,
+#                                                              isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+#                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+#                                                              large_element] * (2 - large_side)
+
+#                 tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+#                                                             u[i,
+#                                                               isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                               isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+#                                                               isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+#                                                               large_element] * (2 - large_side)
+#             end
+
+#             for j1j1 in axes(forward_lower, 2)
+#                 tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+#                                                            u[i,
+#                                                              isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+#                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
+#                                                              large_element] * (large_side - 1)
+
+#                 tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+#                                                             u[i,
+#                                                               isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                               isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+#                                                               isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
+#                                                               large_element] * (large_side - 1)
+
+#                 tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+#                                                            u[i,
+#                                                              isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                              isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+#                                                              isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
+#                                                              large_element] * (large_side - 1)
+
+#                 tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+#                                                             u[i,
+#                                                               isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+#                                                               isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+#                                                               isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3),
+#                                                               large_element] * (large_side - 1)
+#             end
+#         end
+#     end
+
+#     return nothing
+# end
+
+# # Kernel for interpolating data large to small on mortars - step 2
+# function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left,
+#                                              u_lower_right, tmp_upper_left, tmp_upper_right,
+#                                              tmp_lower_left, tmp_lower_right, forward_upper,
+#                                              forward_lower, large_sides)
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 &&
+#         k <= size(u_upper_left, 5))
+#         u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2)
+
+#         j1 = div(j - 1, u2) + 1
+#         j2 = rem(j - 1, u2) + 1
+
+#         leftright = large_sides[k]
+
+#         @inbounds begin
+#             for j2j2 in axes(forward_upper, 2)
+#                 u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
+#                                                          tmp_upper_left[leftright, i, j1, j2j2, k]
+
+#                 u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
+#                                                           tmp_upper_right[leftright, i, j1, j2j2, k]
+
+#                 u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
+#                                                          tmp_lower_left[leftright, i, j1, j2j2, k]
+
+#                 u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
+#                                                           tmp_lower_right[leftright, i, j1, j2j2, k]
+#             end
+#         end
+#     end
+
+#     return nothing
+# end
+
+# Kernel for interpolating data large to small on mortars
+function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, u_lower_right,
+                                             tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right,
+                                             u, forward_upper, forward_lower, neighbor_ids, large_sides,
+                                             orientations)
     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
-    if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 &&
-        k <= size(u_upper_left, 5))
-        u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2)
-
-        j1 = div(j - 1, u2) + 1
-        j2 = rem(j - 1, u2) + 1
-
-        leftright = large_sides[k]
-
-        @inbounds begin
-            for j2j2 in axes(forward_upper, 2)
-                u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
-                                                         tmp_upper_left[leftright, i, j1, j2j2, k]
-
-                u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
-                                                          tmp_upper_right[leftright, i, j1, j2j2, k]
-
-                u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
-                                                         tmp_lower_left[leftright, i, j1, j2j2, k]
-
-                u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
-                                                          tmp_lower_right[leftright, i, j1, j2j2, k]
+    # Loop stride for each dimension
+    stride_x = gridDim().x * blockDim().x
+    stride_y = gridDim().y * blockDim().y
+    stride_z = gridDim().z * blockDim().z
+
+    # Cooperative kernel needs stride loops to handle the constrained launch size
+    while i <= size(tmp_upper_left, 2)
+        while j <= size(tmp_upper_left, 3)^2
+            while k <= size(tmp_upper_left, 5)
+                u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2)
+
+                j1 = div(j - 1, u2) + 1
+                j2 = rem(j - 1, u2) + 1
+
+                large_side = large_sides[k]
+                orientation = orientations[k]
+                large_element = neighbor_ids[5, k]
+
+                leftright = large_side
+
+                @inbounds begin
+                    for j1j1 in axes(forward_lower, 2)
+                        tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+                                                                   u[i,
+                                                                     isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                     isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+                                                                     isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+                                                                     large_element] * (2 - large_side)
+
+                        tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+                                                                    u[i,
+                                                                      isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                      isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+                                                                      isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+                                                                      large_element] * (2 - large_side)
+
+                        tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+                                                                   u[i,
+                                                                     isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                     isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+                                                                     isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+                                                                     large_element] * (2 - large_side)
+
+                        tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+                                                                    u[i,
+                                                                      isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                      isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2,
+                                                                      isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2,
+                                                                      large_element] * (2 - large_side)
+                    end
+
+                    for j1j1 in axes(forward_lower, 2)
+                        tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+                                                                   u[i,
+                                                                     isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                     isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+                                                                     isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation,
+                                                                                                                                           3),
+                                                                     large_element] * (large_side - 1)
+
+                        tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+                                                                    u[i,
+                                                                      isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                      isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+                                                                      isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation,
+                                                                                                                                            3),
+                                                                      large_element] * (large_side - 1)
+
+                        tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] *
+                                                                   u[i,
+                                                                     isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                     isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+                                                                     isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation,
+                                                                                                                                           3),
+                                                                     large_element] * (large_side - 1)
+
+                        tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] *
+                                                                    u[i,
+                                                                      isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1,
+                                                                      isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2,
+                                                                      isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation,
+                                                                                                                                            3),
+                                                                      large_element] * (large_side - 1)
+                    end
+
+                    # Grid scope synchronization
+                    grid = CG.this_grid()
+                    CG.sync(grid)
+
+                    for j2j2 in axes(forward_upper, 2)
+                        u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
+                                                                 tmp_upper_left[leftright, i, j1, j2j2, k]
+
+                        u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] *
+                                                                  tmp_upper_right[leftright, i, j1, j2j2, k]
+
+                        u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
+                                                                 tmp_lower_left[leftright, i, j1, j2j2, k]
+
+                        u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] *
+                                                                  tmp_lower_right[leftright, i, j1, j2j2, k]
+                    end
+                end
+                k += stride_z
             end
+            j += stride_y
         end
+        i += stride_x
     end
 
     return nothing
@@ -1857,7 +1979,81 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::D
     @assert iszero(length(cache.mortars.orientations))
 end
 
-# Pack kernels for prolonging solution to mortars
+# # Pack kernels for prolonging solution to mortars
+# function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache)
+#     neighbor_ids = cache.mortars.neighbor_ids
+#     large_sides = cache.mortars.large_sides
+#     orientations = cache.mortars.orientations
+
+#     # The original CPU arrays hold NaNs
+#     u_upper_left = cache.mortars.u_upper_left
+#     u_upper_right = cache.mortars.u_upper_right
+#     u_lower_left = cache.mortars.u_lower_left
+#     u_lower_right = cache.mortars.u_lower_right
+#     forward_upper = CuArray(dg.mortar.forward_upper)
+#     forward_lower = CuArray(dg.mortar.forward_lower)
+
+#     prolong_mortars_small2small_kernel = @cuda launch=false prolong_mortars_small2small_kernel!(u_upper_left,
+#                                                                                                 u_upper_right,
+#                                                                                                 u_lower_left,
+#                                                                                                 u_lower_right,
+#                                                                                                 u,
+#                                                                                                 neighbor_ids,
+#                                                                                                 large_sides,
+#                                                                                                 orientations)
+#     prolong_mortars_small2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, u,
+#                                        neighbor_ids, large_sides, orientations;
+#                                        kernel_configurator_3d(prolong_mortars_small2small_kernel,
+#                                                               size(u_upper_left, 2),
+#                                                               size(u_upper_left, 3)^2,
+#                                                               size(u_upper_left, 5))...)
+
+#     tmp_upper_left = zero(similar(u_upper_left)) # undef to zero
+#     tmp_upper_right = zero(similar(u_upper_right)) # undef to zero
+#     tmp_lower_left = zero(similar(u_lower_left)) # undef to zero
+#     tmp_lower_right = zero(similar(u_lower_right)) # undef to zero
+
+#     prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(tmp_upper_left,
+#                                                                                                 tmp_upper_right,
+#                                                                                                 tmp_lower_left,
+#                                                                                                 tmp_lower_right,
+#                                                                                                 u,
+#                                                                                                 forward_upper,
+#                                                                                                 forward_lower,
+#                                                                                                 neighbor_ids,
+#                                                                                                 large_sides,
+#                                                                                                 orientations)
+#     prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                        tmp_lower_right, u, forward_upper, forward_lower,
+#                                        neighbor_ids, large_sides, orientations;
+#                                        kernel_configurator_3d(prolong_mortars_large2small_kernel,
+#                                                               size(u_upper_left, 2),
+#                                                               size(u_upper_left, 3)^2,
+#                                                               size(u_upper_left, 5))...)
+
+#     prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left,
+#                                                                                                 u_upper_right,
+#                                                                                                 u_lower_left,
+#                                                                                                 u_lower_right,
+#                                                                                                 tmp_upper_left,
+#                                                                                                 tmp_upper_right,
+#                                                                                                 tmp_lower_left,
+#                                                                                                 tmp_lower_right,
+#                                                                                                 forward_upper,
+#                                                                                                 forward_lower,
+#                                                                                                 large_sides)
+#     prolong_mortars_large2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right,
+#                                        tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                        tmp_lower_right, forward_upper, forward_lower, large_sides;
+#                                        kernel_configurator_3d(prolong_mortars_large2small_kernel,
+#                                                               size(u_upper_left, 2),
+#                                                               size(u_upper_left, 3)^2,
+#                                                               size(u_upper_left, 5))...)
+
+#     return nothing
+# end
+
+# Pack kernels for prolonging solution to mortars (optimized)
 function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache)
     neighbor_ids = cache.mortars.neighbor_ids
     large_sides = cache.mortars.large_sides
@@ -1891,25 +2087,6 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG
     tmp_lower_left = zero(similar(u_lower_left)) # undef to zero
     tmp_lower_right = zero(similar(u_lower_right)) # undef to zero
 
-    # TODO: Combine these two kernels into one (synchronization)
-    prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(tmp_upper_left,
-                                                                                                tmp_upper_right,
-                                                                                                tmp_lower_left,
-                                                                                                tmp_lower_right,
-                                                                                                u,
-                                                                                                forward_upper,
-                                                                                                forward_lower,
-                                                                                                neighbor_ids,
-                                                                                                large_sides,
-                                                                                                orientations)
-    prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                                       tmp_lower_right, u, forward_upper, forward_lower,
-                                       neighbor_ids, large_sides, orientations;
-                                       kernel_configurator_3d(prolong_mortars_large2small_kernel,
-                                                              size(u_upper_left, 2),
-                                                              size(u_upper_left, 3)^2,
-                                                              size(u_upper_left, 5))...)
-
     prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left,
                                                                                                 u_upper_right,
                                                                                                 u_lower_left,
@@ -1918,16 +2095,19 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG
                                                                                                 tmp_upper_right,
                                                                                                 tmp_lower_left,
                                                                                                 tmp_lower_right,
-                                                                                                forward_upper,
+                                                                                                u, forward_upper,
                                                                                                 forward_lower,
-                                                                                                large_sides)
+                                                                                                neighbor_ids,
+                                                                                                large_sides,
+                                                                                                orientations)
     prolong_mortars_large2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right,
                                        tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                                       tmp_lower_right, forward_upper, forward_lower, large_sides;
-                                       kernel_configurator_3d(prolong_mortars_large2small_kernel,
-                                                              size(u_upper_left, 2),
-                                                              size(u_upper_left, 3)^2,
-                                                              size(u_upper_left, 5))...)
+                                       tmp_lower_right, u, forward_upper, forward_lower, neighbor_ids,
+                                       large_sides, orientations; cooperative = true,
+                                       kernel_configurator_coop_3d(prolong_mortars_large2small_kernel,
+                                                                   size(u_upper_left, 2),
+                                                                   size(u_upper_left, 3)^2,
+                                                                   size(u_upper_left, 5))...)
 
     return nothing
 end
diff --git a/test.jl b/test.jl
new file mode 100644
index 00000000..95ad959f
--- /dev/null
+++ b/test.jl
@@ -0,0 +1,48 @@
+# using CUDA
+
+# # Kernel using CG.this_grid() for grid-level synchronization
+# function simple_kernel(data)
+#     idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+
+#     # Simple operation: increment each element
+#     if idx <= length(data)
+#         data[idx] += 1.0f0
+#     end
+
+#     grid = CG.this_grid()
+#     CG.sync(grid)
+
+#     return nothing
+# end
+
+# # Host code
+# function main()
+#     n = 1024
+#     data = CUDA.fill(Float32(1.0), n)  # Initialize array with 1.0s
+
+#     # Kernel launch configuration
+#     threads_per_block = 10
+#     num_blocks = ceil(Int, n / threads_per_block)
+
+#     println("num_blocks: ", num_blocks)
+
+#     # Launch kernel with cooperative launch enabled
+#     kernel = @cuda launch=false simple_kernel(data)
+
+#     kernel(data; blocks=num_blocks, threads=threads_per_block, cooperative=true)
+
+#     # Fetch results back to host
+#     result = Array(data)
+# end
+
+# main()
+
+using CUDA
+
+function query_max_threads_per_block()
+    device = CUDA.device()  # Get the current CUDA device
+    max_threads = CUDA.capability(device).max_threads_per_block  # Query max threads per block
+    println("Maximum threads per block: $max_threads")
+end
+
+query_max_threads_per_block()
diff --git a/test2.jl b/test2.jl
new file mode 100644
index 00000000..ab044f24
--- /dev/null
+++ b/test2.jl
@@ -0,0 +1,97 @@
+using Trixi, TrixiCUDA
+using CUDA
+using Test
+using BenchmarkTools
+
+include("test/test_macros.jl")
+
+advection_velocity = (0.2, -0.7, 0.5)
+equations = LinearScalarAdvectionEquation3D(advection_velocity)
+
+initial_condition = initial_condition_convergence_test
+solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)
+
+coordinates_min = (-1.0, -1.0, -1.0)
+coordinates_max = (1.0, 1.0, 1.0)
+refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0),
+                       coordinates_max = (1.0, 1.0, 1.0)),
+                      (type = "box", coordinates_min = (0.0, -0.5, -0.5),
+                       coordinates_max = (0.5, 0.5, 0.5)))
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 2,
+                refinement_patches = refinement_patches,
+                n_cells_max = 10_000)
+
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
+
+tspan = tspan_gpu = (0.0, 5.0)
+t = t_gpu = 0.0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu = copy(ode_gpu.u0)
+du_gpu = similar(u_gpu)
+
+# Tests for components initialization
+@test_approx (u_gpu, u)
+# du is initlaizaed as undefined, cannot test now
+
+# Tests for semidiscretization process
+TrixiCUDA.reset_du!(du_gpu)
+Trixi.reset_du!(du, solver, cache)
+@test_approx (du_gpu, du)
+
+TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                Trixi.have_nonconservative_terms(equations_gpu),
+                                equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                cache_gpu)
+Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations),
+                            equations, solver.volume_integral, solver, cache)
+@test_approx (du_gpu, du)
+
+TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
+Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver)
+@test_approx (cache_gpu.interfaces.u, cache.interfaces.u)
+
+TrixiCUDA.cuda_interface_flux!(mesh_gpu,
+                               Trixi.have_nonconservative_terms(equations_gpu),
+                               equations_gpu, solver_gpu, cache_gpu)
+Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh,
+                           Trixi.have_nonconservative_terms(equations), equations,
+                           solver.surface_integral, solver, cache)
+@test_approx (cache_gpu.elements.surface_flux_values,
+              cache.elements.surface_flux_values)
+
+TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
+                                   equations_gpu, cache_gpu)
+Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver)
+@test_approx (cache_gpu.boundaries.u, cache.boundaries.u)
+
+TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
+                              Trixi.have_nonconservative_terms(equations_gpu),
+                              equations_gpu, solver_gpu, cache_gpu)
+Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations,
+                          solver.surface_integral, solver)
+@test_approx (cache_gpu.elements.surface_flux_values,
+              cache.elements.surface_flux_values)
+
+TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
+                                TrixiCUDA.check_cache_mortars(cache_gpu),
+                                solver_gpu, cache_gpu)

From 0a454c9c6ef885eaf72afa9d3598920645909b3e Mon Sep 17 00:00:00 2001
From: huiyuxie <huiyuxie.sde@gmail.com>
Date: Fri, 20 Dec 2024 22:18:47 -1000
Subject: [PATCH 2/3] Complete

---
 benchmark.jl         |  38 +--
 src/solvers/dg_2d.jl |  12 +-
 src/solvers/dg_3d.jl | 607 +++++++++++++++++++++++++++++++------------
 test.jl              |  48 ----
 test2.jl             |  97 -------
 5 files changed, 472 insertions(+), 330 deletions(-)
 delete mode 100644 test.jl
 delete mode 100644 test2.jl

diff --git a/benchmark.jl b/benchmark.jl
index b401eca3..36a1c228 100644
--- a/benchmark.jl
+++ b/benchmark.jl
@@ -2,24 +2,27 @@ using Trixi, TrixiCUDA
 using CUDA
 using BenchmarkTools
 
-equations = CompressibleEulerEquations3D(1.4)
+equations = IdealGlmMhdEquations3D(5 / 3)
 
 initial_condition = initial_condition_convergence_test
-solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)
 
-coordinates_min = (0.0, 0.0, 0.0)
-coordinates_max = (2.0, 2.0, 2.0)
-refinement_patches = ((type = "box", coordinates_min = (0.5, 0.5, 0.5),
-                       coordinates_max = (1.5, 1.5, 1.5)),)
+volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+solver = DGSEM(polydeg = 3,
+               surface_flux = (flux_hlle,
+                               flux_nonconservative_powell),
+               volume_integral = VolumeIntegralFluxDifferencing(volume_flux))
+
+coordinates_min = (-1.0, -1.0, -1.0)
+coordinates_max = (1.0, 1.0, 1.0)
+refinement_patches = ((type = "box", coordinates_min = (-0.5, -0.5, -0.5),
+                       coordinates_max = (0.5, 0.5, 0.5)),)
 mesh = TreeMesh(coordinates_min, coordinates_max,
                 initial_refinement_level = 2,
                 refinement_patches = refinement_patches,
                 n_cells_max = 10_000)
 
-semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
-                                    source_terms = source_terms_convergence_test)
-semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver,
-                                           source_terms = source_terms_convergence_test)
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
 
 tspan = tspan_gpu = (0.0, 1.0)
 t = t_gpu = 0.0
@@ -73,11 +76,10 @@ TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
                               equations_gpu, solver_gpu, cache_gpu)
 
 # Prolong to mortars test
-@benchmark CUDA.@sync TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
-                                                      TrixiCUDA.check_cache_mortars(cache_gpu),
-                                                      solver_gpu, cache_gpu)
-
-# # Mortar flux test
-# @benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu),
-#                             Trixi.have_nonconservative_terms(equations_gpu),
-#                             equations_gpu, solver_gpu, cache_gpu)
+TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
+                                TrixiCUDA.check_cache_mortars(cache_gpu),
+                                solver_gpu, cache_gpu)
+
+@benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu),
+                                                  Trixi.have_nonconservative_terms(equations_gpu),
+                                                  equations_gpu, solver_gpu, cache_gpu)
diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl
index de0643eb..e7cae9f8 100644
--- a/src/solvers/dg_2d.jl
+++ b/src/solvers/dg_2d.jl
@@ -902,7 +902,7 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_value
                                      fstar_primary_upper, fstar_primary_lower,
                                      fstar_secondary_upper, fstar_secondary_lower,
                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                                     orientations, equations::AbstractEquations{2})
+                                     orientations)
     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
@@ -1619,12 +1619,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
                                                                                 reverse_lower,
                                                                                 neighbor_ids,
                                                                                 large_sides,
-                                                                                orientations,
-                                                                                equations)
+                                                                                orientations)
     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
                                fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations, equations;
+                               orientations;
                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
                                                       size(surface_flux_values, 1),
                                                       size(surface_flux_values, 2),
@@ -1679,12 +1678,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
                                                                                 reverse_lower,
                                                                                 neighbor_ids,
                                                                                 large_sides,
-                                                                                orientations,
-                                                                                equations)
+                                                                                orientations)
     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
                                fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations, equations;
+                               orientations;
                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
                                                       size(surface_flux_values, 1),
                                                       size(surface_flux_values, 2),
diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl
index 54ed7be5..8bdf8468 100644
--- a/src/solvers/dg_3d.jl
+++ b/src/solvers/dg_3d.jl
@@ -1272,8 +1272,128 @@ function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right
 end
 
 # # Kernel for copying mortar fluxes small to small and small to large - step 1
-function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right,
-                                     tmp_lower_left, tmp_lower_right,
+# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right,
+#                                      tmp_lower_left, tmp_lower_right,
+#                                      fstar_primary_upper_left, fstar_primary_upper_right,
+#                                      fstar_primary_lower_left, fstar_primary_lower_right,
+#                                      fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                      fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                      orientations)
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
+#         k <= length(orientations))
+#         j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+#         j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+#         lower_left_element = neighbor_ids[1, k]
+#         lower_right_element = neighbor_ids[2, k]
+#         upper_left_element = neighbor_ids[3, k]
+#         upper_right_element = neighbor_ids[4, k]
+#         large_element = neighbor_ids[5, k]
+
+#         large_side = large_sides[k]
+#         orientation = orientations[k]
+
+#         # Use simple math expression to enhance the performance (against control flow), 
+#         # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
+#         # Please also check the original code in Trixi.jl when you modify this code.
+#         direction = 2 * orientation + large_side - 2
+
+#         surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
+
+#         # Use simple math expression to enhance the performance (against control flow), 
+#         # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
+#         # Please also check the original code in Trixi.jl when you modify this code.
+#         direction = 2 * orientation - large_side + 1
+
+#         @inbounds begin
+#             for j1j1 in axes(reverse_upper, 2)
+#                 tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+#                                                                        fstar_secondary_upper_left[i, j1j1, j2, k]
+#                 tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+#                                                                         fstar_secondary_upper_right[i, j1j1, j2, k]
+#                 tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+#                                                                        fstar_secondary_lower_left[i, j1j1, j2, k]
+#                 tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+#                                                                         fstar_secondary_lower_right[i, j1j1, j2, k]
+#             end
+#         end
+#     end
+
+#     return nothing
+# end
+
+# # Kernel for copying mortar fluxes small to small and small to large - step 2
+# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                      tmp_upper_right, tmp_lower_left, tmp_lower_right,
+#                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                      orientations, equations::AbstractEquations{3})
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
+#         k <= length(orientations))
+#         j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+#         j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+#         large_element = neighbor_ids[5, k]
+
+#         large_side = large_sides[k]
+#         orientation = orientations[k]
+
+#         # See step 1 for the explanation of the following expression
+#         direction = 2 * orientation - large_side + 1
+
+#         @inbounds begin
+#             for j2j2 in axes(reverse_lower, 2)
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+#                                                                                 tmp_upper_left[i, j1, j2j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+#                                                                                 tmp_upper_right[i, j1, j2j2,
+#                                                                                                 direction,
+#                                                                                                 large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+#                                                                                 tmp_lower_left[i, j1, j2j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+#                                                                                 tmp_lower_right[i, j1, j2j2,
+#                                                                                                 direction,
+#                                                                                                 large_element]
+#             end
+
+#             surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#         end
+#     end
+
+#     return nothing
+# end
+
+# Kernel for copying mortar fluxes small to small and small to large
+function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values,
+                                     tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right,
                                      fstar_primary_upper_left, fstar_primary_upper_right,
                                      fstar_primary_lower_left, fstar_primary_lower_right,
                                      fstar_secondary_upper_left, fstar_secondary_upper_right,
@@ -1284,108 +1404,98 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_up
     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
-    if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
-        k <= length(orientations))
-        j1 = div(j - 1, size(surface_flux_values, 2)) + 1
-        j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
-
-        lower_left_element = neighbor_ids[1, k]
-        lower_right_element = neighbor_ids[2, k]
-        upper_left_element = neighbor_ids[3, k]
-        upper_right_element = neighbor_ids[4, k]
-        large_element = neighbor_ids[5, k]
-
-        large_side = large_sides[k]
-        orientation = orientations[k]
-
-        # Use simple math expression to enhance the performance (against control flow), 
-        # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
-        #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
-        #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
-        #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
-        #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
-        #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
-        # Please also check the original code in Trixi.jl when you modify this code.
-        direction = 2 * orientation + large_side - 2
-
-        surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
-
-        # Use simple math expression to enhance the performance (against control flow), 
-        # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
-        #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
-        #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
-        #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
-        #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
-        #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
-        # Please also check the original code in Trixi.jl when you modify this code.
-        direction = 2 * orientation - large_side + 1
+    # Loop stride for each dimension
+    stride_x = gridDim().x * blockDim().x
+    stride_y = gridDim().y * blockDim().y
+    stride_z = gridDim().z * blockDim().z
 
-        @inbounds begin
-            for j1j1 in axes(reverse_upper, 2)
-                tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
-                                                                       fstar_secondary_upper_left[i, j1j1, j2, k]
-                tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
-                                                                        fstar_secondary_upper_right[i, j1j1, j2, k]
-                tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
-                                                                       fstar_secondary_lower_left[i, j1j1, j2, k]
-                tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
-                                                                        fstar_secondary_lower_right[i, j1j1, j2, k]
-            end
-        end
-    end
+    # Cooperative kernel needs stride loops to handle the constrained launch size
+    while i <= size(surface_flux_values, 1)
+        while j <= size(surface_flux_values, 2)^2
+            while k <= length(orientations)
+                j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+                j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+                lower_left_element = neighbor_ids[1, k]
+                lower_right_element = neighbor_ids[2, k]
+                upper_left_element = neighbor_ids[3, k]
+                upper_right_element = neighbor_ids[4, k]
+                large_element = neighbor_ids[5, k]
 
-    return nothing
-end
+                large_side = large_sides[k]
+                orientation = orientations[k]
 
-# Kernel for copying mortar fluxes small to small and small to large - step 2
-function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                                     tmp_upper_right, tmp_lower_left, tmp_lower_right,
-                                     reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                                     orientations, equations::AbstractEquations{3})
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+                # Use simple math expression to enhance the performance (against control flow), 
+                # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
+                #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
+                #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
+                #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
+                #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
+                #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
+                # Please also check the original code in Trixi.jl when you modify this code.
+                direction = 2 * orientation + large_side - 2
+
+                surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
+
+                # Use simple math expression to enhance the performance (against control flow), 
+                # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
+                #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
+                #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
+                #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
+                #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
+                #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
+                # Please also check the original code in Trixi.jl when you modify this code.
+                direction = 2 * orientation - large_side + 1
 
-    if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
-        k <= length(orientations))
-        j1 = div(j - 1, size(surface_flux_values, 2)) + 1
-        j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+                @inbounds begin
+                    for j1j1 in axes(reverse_upper, 2)
+                        tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+                                                                               fstar_secondary_upper_left[i, j1j1, j2, k]
+                        tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+                                                                                fstar_secondary_upper_right[i, j1j1, j2, k]
+                        tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+                                                                               fstar_secondary_lower_left[i, j1j1, j2, k]
+                        tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+                                                                                fstar_secondary_lower_right[i, j1j1, j2, k]
+                    end
+                end
 
-        large_element = neighbor_ids[5, k]
+                # Grid scope synchronization
+                grid = CG.this_grid()
+                CG.sync(grid)
 
-        large_side = large_sides[k]
-        orientation = orientations[k]
-
-        # See step 1 for the explanation of the following expression
-        direction = 2 * orientation - large_side + 1
+                @inbounds begin
+                    for j2j2 in axes(reverse_lower, 2)
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+                                                                                        tmp_upper_left[i, j1, j2j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+                                                                                        tmp_upper_right[i, j1, j2j2,
+                                                                                                        direction,
+                                                                                                        large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+                                                                                        tmp_lower_left[i, j1, j2j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+                                                                                        tmp_lower_right[i, j1, j2j2,
+                                                                                                        direction,
+                                                                                                        large_element]
+                    end
 
-        @inbounds begin
-            for j2j2 in axes(reverse_lower, 2)
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
-                                                                                tmp_upper_left[i, j1, j2j2,
-                                                                                               direction,
-                                                                                               large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
-                                                                                tmp_upper_right[i, j1, j2j2,
-                                                                                                direction,
-                                                                                                large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
-                                                                                tmp_lower_left[i, j1, j2j2,
-                                                                                               direction,
-                                                                                               large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
-                                                                                tmp_lower_right[i, j1, j2j2,
-                                                                                                direction,
-                                                                                                large_element]
+                    surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                end
+                k += stride_z
             end
-
-            surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
-                                                                                               direction,
-                                                                                               large_element]
+            j += stride_y
         end
+        i += stride_x
     end
 
     return nothing
@@ -2118,7 +2228,116 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::False, nonconservat
     @assert iszero(length(cache.mortars.orientations))
 end
 
-# Pack kernels for calculating mortar fluxes
+# # Pack kernels for calculating mortar fluxes
+# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False,
+#                            equations, dg::DGSEM, cache)
+#     surface_flux = dg.surface_integral.surface_flux
+
+#     neighbor_ids = cache.mortars.neighbor_ids
+#     large_sides = cache.mortars.large_sides
+#     orientations = cache.mortars.orientations
+
+#     # The original CPU arrays hold NaNs
+#     u_upper_left = cache.mortars.u_upper_left
+#     u_upper_right = cache.mortars.u_upper_right
+#     u_lower_left = cache.mortars.u_lower_left
+#     u_lower_right = cache.mortars.u_lower_right
+#     reverse_upper = CuArray(dg.mortar.reverse_upper)
+#     reverse_lower = CuArray(dg.mortar.reverse_lower)
+
+#     surface_flux_values = cache.elements.surface_flux_values
+#     tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero
+
+#     fstar_primary_upper_left = cache.fstar_primary_upper_left
+#     fstar_primary_upper_right = cache.fstar_primary_upper_right
+#     fstar_primary_lower_left = cache.fstar_primary_lower_left
+#     fstar_primary_lower_right = cache.fstar_primary_lower_right
+#     fstar_secondary_upper_left = cache.fstar_secondary_upper_left
+#     fstar_secondary_upper_right = cache.fstar_secondary_upper_right
+#     fstar_secondary_lower_left = cache.fstar_secondary_lower_left
+#     fstar_secondary_lower_right = cache.fstar_secondary_lower_right
+
+#     mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left,
+#                                                                 fstar_primary_upper_right,
+#                                                                 fstar_primary_lower_left,
+#                                                                 fstar_primary_lower_right,
+#                                                                 fstar_secondary_upper_left,
+#                                                                 fstar_secondary_upper_right,
+#                                                                 fstar_secondary_lower_left,
+#                                                                 fstar_secondary_lower_right,
+#                                                                 u_upper_left, u_upper_right,
+#                                                                 u_lower_left, u_lower_right,
+#                                                                 orientations, equations,
+#                                                                 surface_flux)
+#     mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right,
+#                        fstar_primary_lower_left, fstar_primary_lower_right,
+#                        fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                        fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                        u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations,
+#                        equations, surface_flux;
+#                        kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3),
+#                                               size(u_upper_left, 4),
+#                                               length(orientations))...)
+
+#     tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 fstar_primary_upper_left,
+#                                                                                 fstar_primary_upper_right,
+#                                                                                 fstar_primary_lower_left,
+#                                                                                 fstar_primary_lower_right,
+#                                                                                 fstar_secondary_upper_left,
+#                                                                                 fstar_secondary_upper_right,
+#                                                                                 fstar_secondary_lower_left,
+#                                                                                 fstar_secondary_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+#                                fstar_primary_lower_left, fstar_primary_lower_right,
+#                                fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                orientations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations,
+#                                                                                 equations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
+#                                reverse_lower, neighbor_ids, large_sides, orientations, equations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     return nothing
+# end
+
+# Pack kernels for calculating mortar fluxes (optimized)
 function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False,
                            equations, dg::DGSEM, cache)
     surface_flux = dg.surface_integral.surface_flux
@@ -2174,8 +2393,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
 
-    # TODO: Combine these two copy kernels into one (synchronization)
     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+                                                                                tmp_surface_flux_values,
                                                                                 tmp_upper_left,
                                                                                 tmp_upper_right,
                                                                                 tmp_lower_left,
@@ -2193,41 +2412,130 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
                                                                                 neighbor_ids,
                                                                                 large_sides,
                                                                                 orientations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                               tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
-                               fstar_primary_lower_left, fstar_primary_lower_right,
-                               fstar_secondary_upper_left, fstar_secondary_upper_right,
-                               fstar_secondary_lower_left, fstar_secondary_lower_right,
-                               reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
-
-    mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
-                                                                                tmp_surface_flux_values,
-                                                                                tmp_upper_left,
-                                                                                tmp_upper_right,
-                                                                                tmp_lower_left,
-                                                                                tmp_lower_right,
-                                                                                reverse_upper,
-                                                                                reverse_lower,
-                                                                                neighbor_ids,
-                                                                                large_sides,
-                                                                                orientations,
-                                                                                equations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                               tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
-                               reverse_lower, neighbor_ids, large_sides, orientations, equations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
+    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right,
+                               tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+                               fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left,
+                               fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right,
+                               reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations;
+                               cooperative = true,
+                               kernel_configurator_coop_3d(mortar_flux_copy_to_kernel,
+                                                           size(surface_flux_values, 1),
+                                                           size(surface_flux_values, 2)^2,
+                                                           length(orientations))...)
 
     return nothing
 end
 
+# # Pack kernels for calculating mortar fluxes
+# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True,
+#                            equations, dg::DGSEM, cache)
+#     surface_flux, nonconservative_flux = dg.surface_integral.surface_flux
+
+#     neighbor_ids = cache.mortars.neighbor_ids
+#     large_sides = cache.mortars.large_sides
+#     orientations = cache.mortars.orientations
+
+#     # The original CPU arrays hold NaNs
+#     u_upper_left = cache.mortars.u_upper_left
+#     u_upper_right = cache.mortars.u_upper_right
+#     u_lower_left = cache.mortars.u_lower_left
+#     u_lower_right = cache.mortars.u_lower_right
+#     reverse_upper = CuArray(dg.mortar.reverse_upper)
+#     reverse_lower = CuArray(dg.mortar.reverse_lower)
+
+#     surface_flux_values = cache.elements.surface_flux_values
+#     tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero
+
+#     fstar_primary_upper_left = cache.fstar_primary_upper_left
+#     fstar_primary_upper_right = cache.fstar_primary_upper_right
+#     fstar_primary_lower_left = cache.fstar_primary_lower_left
+#     fstar_primary_lower_right = cache.fstar_primary_lower_right
+#     fstar_secondary_upper_left = cache.fstar_secondary_upper_left
+#     fstar_secondary_upper_right = cache.fstar_secondary_upper_right
+#     fstar_secondary_lower_left = cache.fstar_secondary_lower_left
+#     fstar_secondary_lower_right = cache.fstar_secondary_lower_right
+
+#     mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left,
+#                                                                 fstar_primary_upper_right,
+#                                                                 fstar_primary_lower_left,
+#                                                                 fstar_primary_lower_right,
+#                                                                 fstar_secondary_upper_left,
+#                                                                 fstar_secondary_upper_right,
+#                                                                 fstar_secondary_lower_left,
+#                                                                 fstar_secondary_lower_right,
+#                                                                 u_upper_left, u_upper_right,
+#                                                                 u_lower_left, u_lower_right,
+#                                                                 orientations, large_sides,
+#                                                                 equations, surface_flux,
+#                                                                 nonconservative_flux)
+#     mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right,
+#                        fstar_primary_lower_left, fstar_primary_lower_right,
+#                        fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                        fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                        u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations,
+#                        large_sides, equations, surface_flux, nonconservative_flux;
+#                        kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3),
+#                                               size(u_upper_left, 4),
+#                                               length(orientations))...)
+
+#     tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 fstar_primary_upper_left,
+#                                                                                 fstar_primary_upper_right,
+#                                                                                 fstar_primary_lower_left,
+#                                                                                 fstar_primary_lower_right,
+#                                                                                 fstar_secondary_upper_left,
+#                                                                                 fstar_secondary_upper_right,
+#                                                                                 fstar_secondary_lower_left,
+#                                                                                 fstar_secondary_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+#                                fstar_primary_lower_left, fstar_primary_lower_right,
+#                                fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                orientations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations,
+#                                                                                 equations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
+#                                reverse_lower, neighbor_ids, large_sides, orientations, equations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     return nothing
+# end
+
 # Pack kernels for calculating mortar fluxes
 function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True,
                            equations, dg::DGSEM, cache)
@@ -2285,8 +2593,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
 
-    # TODO: Combine these two copy kernels into one (synchronization)
     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+                                                                                tmp_surface_flux_values,
                                                                                 tmp_upper_left,
                                                                                 tmp_upper_right,
                                                                                 tmp_lower_left,
@@ -2304,37 +2612,16 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
                                                                                 neighbor_ids,
                                                                                 large_sides,
                                                                                 orientations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                               tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
-                               fstar_primary_lower_left, fstar_primary_lower_right,
-                               fstar_secondary_upper_left, fstar_secondary_upper_right,
-                               fstar_secondary_lower_left, fstar_secondary_lower_right,
-                               reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
-
-    mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
-                                                                                tmp_surface_flux_values,
-                                                                                tmp_upper_left,
-                                                                                tmp_upper_right,
-                                                                                tmp_lower_left,
-                                                                                tmp_lower_right,
-                                                                                reverse_upper,
-                                                                                reverse_lower,
-                                                                                neighbor_ids,
-                                                                                large_sides,
-                                                                                orientations,
-                                                                                equations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                               tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
-                               reverse_lower, neighbor_ids, large_sides, orientations, equations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
+    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right,
+                               tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+                               fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left,
+                               fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right,
+                               reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations;
+                               cooperative = true,
+                               kernel_configurator_coop_3d(mortar_flux_copy_to_kernel,
+                                                           size(surface_flux_values, 1),
+                                                           size(surface_flux_values, 2)^2,
+                                                           length(orientations))...)
 
     return nothing
 end
diff --git a/test.jl b/test.jl
deleted file mode 100644
index 95ad959f..00000000
--- a/test.jl
+++ /dev/null
@@ -1,48 +0,0 @@
-# using CUDA
-
-# # Kernel using CG.this_grid() for grid-level synchronization
-# function simple_kernel(data)
-#     idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-#     # Simple operation: increment each element
-#     if idx <= length(data)
-#         data[idx] += 1.0f0
-#     end
-
-#     grid = CG.this_grid()
-#     CG.sync(grid)
-
-#     return nothing
-# end
-
-# # Host code
-# function main()
-#     n = 1024
-#     data = CUDA.fill(Float32(1.0), n)  # Initialize array with 1.0s
-
-#     # Kernel launch configuration
-#     threads_per_block = 10
-#     num_blocks = ceil(Int, n / threads_per_block)
-
-#     println("num_blocks: ", num_blocks)
-
-#     # Launch kernel with cooperative launch enabled
-#     kernel = @cuda launch=false simple_kernel(data)
-
-#     kernel(data; blocks=num_blocks, threads=threads_per_block, cooperative=true)
-
-#     # Fetch results back to host
-#     result = Array(data)
-# end
-
-# main()
-
-using CUDA
-
-function query_max_threads_per_block()
-    device = CUDA.device()  # Get the current CUDA device
-    max_threads = CUDA.capability(device).max_threads_per_block  # Query max threads per block
-    println("Maximum threads per block: $max_threads")
-end
-
-query_max_threads_per_block()
diff --git a/test2.jl b/test2.jl
deleted file mode 100644
index ab044f24..00000000
--- a/test2.jl
+++ /dev/null
@@ -1,97 +0,0 @@
-using Trixi, TrixiCUDA
-using CUDA
-using Test
-using BenchmarkTools
-
-include("test/test_macros.jl")
-
-advection_velocity = (0.2, -0.7, 0.5)
-equations = LinearScalarAdvectionEquation3D(advection_velocity)
-
-initial_condition = initial_condition_convergence_test
-solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)
-
-coordinates_min = (-1.0, -1.0, -1.0)
-coordinates_max = (1.0, 1.0, 1.0)
-refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0),
-                       coordinates_max = (1.0, 1.0, 1.0)),
-                      (type = "box", coordinates_min = (0.0, -0.5, -0.5),
-                       coordinates_max = (0.5, 0.5, 0.5)))
-mesh = TreeMesh(coordinates_min, coordinates_max,
-                initial_refinement_level = 2,
-                refinement_patches = refinement_patches,
-                n_cells_max = 10_000)
-
-semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
-semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
-
-tspan = tspan_gpu = (0.0, 5.0)
-t = t_gpu = 0.0
-
-# Semi on CPU
-(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
-
-# Semi on GPU
-equations_gpu = semi_gpu.equations
-mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
-boundary_conditions_gpu = semi_gpu.boundary_conditions
-source_terms_gpu = semi_gpu.source_terms
-
-# ODE on CPU
-ode = semidiscretize(semi, tspan)
-u_ode = copy(ode.u0)
-du_ode = similar(u_ode)
-u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
-du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
-
-# ODE on GPU
-ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
-u_gpu = copy(ode_gpu.u0)
-du_gpu = similar(u_gpu)
-
-# Tests for components initialization
-@test_approx (u_gpu, u)
-# du is initlaizaed as undefined, cannot test now
-
-# Tests for semidiscretization process
-TrixiCUDA.reset_du!(du_gpu)
-Trixi.reset_du!(du, solver, cache)
-@test_approx (du_gpu, du)
-
-TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
-                                Trixi.have_nonconservative_terms(equations_gpu),
-                                equations_gpu, solver_gpu.volume_integral, solver_gpu,
-                                cache_gpu)
-Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations),
-                            equations, solver.volume_integral, solver, cache)
-@test_approx (du_gpu, du)
-
-TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
-Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver)
-@test_approx (cache_gpu.interfaces.u, cache.interfaces.u)
-
-TrixiCUDA.cuda_interface_flux!(mesh_gpu,
-                               Trixi.have_nonconservative_terms(equations_gpu),
-                               equations_gpu, solver_gpu, cache_gpu)
-Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh,
-                           Trixi.have_nonconservative_terms(equations), equations,
-                           solver.surface_integral, solver, cache)
-@test_approx (cache_gpu.elements.surface_flux_values,
-              cache.elements.surface_flux_values)
-
-TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
-                                   equations_gpu, cache_gpu)
-Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver)
-@test_approx (cache_gpu.boundaries.u, cache.boundaries.u)
-
-TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
-                              Trixi.have_nonconservative_terms(equations_gpu),
-                              equations_gpu, solver_gpu, cache_gpu)
-Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations,
-                          solver.surface_integral, solver)
-@test_approx (cache_gpu.elements.surface_flux_values,
-              cache.elements.surface_flux_values)
-
-TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
-                                TrixiCUDA.check_cache_mortars(cache_gpu),
-                                solver_gpu, cache_gpu)

From a01f7a3923854fcc3358cc54001d3466299e00ae Mon Sep 17 00:00:00 2001
From: huiyuxie <huiyuxie.sde@gmail.com>
Date: Fri, 20 Dec 2024 22:29:34 -1000
Subject: [PATCH 3/3] Remove benchmark

---
 benchmark.jl | 85 ----------------------------------------------------
 1 file changed, 85 deletions(-)
 delete mode 100644 benchmark.jl

diff --git a/benchmark.jl b/benchmark.jl
deleted file mode 100644
index 36a1c228..00000000
--- a/benchmark.jl
+++ /dev/null
@@ -1,85 +0,0 @@
-using Trixi, TrixiCUDA
-using CUDA
-using BenchmarkTools
-
-equations = IdealGlmMhdEquations3D(5 / 3)
-
-initial_condition = initial_condition_convergence_test
-
-volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
-solver = DGSEM(polydeg = 3,
-               surface_flux = (flux_hlle,
-                               flux_nonconservative_powell),
-               volume_integral = VolumeIntegralFluxDifferencing(volume_flux))
-
-coordinates_min = (-1.0, -1.0, -1.0)
-coordinates_max = (1.0, 1.0, 1.0)
-refinement_patches = ((type = "box", coordinates_min = (-0.5, -0.5, -0.5),
-                       coordinates_max = (0.5, 0.5, 0.5)),)
-mesh = TreeMesh(coordinates_min, coordinates_max,
-                initial_refinement_level = 2,
-                refinement_patches = refinement_patches,
-                n_cells_max = 10_000)
-
-semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
-semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
-
-tspan = tspan_gpu = (0.0, 1.0)
-t = t_gpu = 0.0
-
-# Semi on CPU
-(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
-
-# Semi on GPU
-equations_gpu = semi_gpu.equations
-mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
-boundary_conditions_gpu = semi_gpu.boundary_conditions
-source_terms_gpu = semi_gpu.source_terms
-
-# ODE on CPU
-ode = semidiscretize(semi, tspan)
-u_ode = copy(ode.u0)
-du_ode = similar(u_ode)
-u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
-du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
-
-# ODE on GPU
-ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
-u_gpu = copy(ode_gpu.u0)
-du_gpu = similar(u_gpu)
-
-# Semidiscretization process
-# Reset du test
-TrixiCUDA.reset_du!(du_gpu)
-
-# Volume integral test
-TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
-                                Trixi.have_nonconservative_terms(equations_gpu),
-                                equations_gpu, solver_gpu.volume_integral, solver_gpu,
-                                cache_gpu)
-
-# Prolong to interfaces test
-TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
-
-# Interface flux test
-TrixiCUDA.cuda_interface_flux!(mesh_gpu,
-                               Trixi.have_nonconservative_terms(equations_gpu),
-                               equations_gpu, solver_gpu, cache_gpu)
-
-# Prolong to boundaries test
-TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
-                                   equations_gpu, cache_gpu)
-
-# Boundary flux test
-TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
-                              Trixi.have_nonconservative_terms(equations_gpu),
-                              equations_gpu, solver_gpu, cache_gpu)
-
-# Prolong to mortars test
-TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
-                                TrixiCUDA.check_cache_mortars(cache_gpu),
-                                solver_gpu, cache_gpu)
-
-@benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu),
-                                                  Trixi.have_nonconservative_terms(equations_gpu),
-                                                  equations_gpu, solver_gpu, cache_gpu)