diff --git a/benchmark.jl b/benchmark.jl
index b401eca..36a1c22 100644
--- a/benchmark.jl
+++ b/benchmark.jl
@@ -2,24 +2,27 @@ using Trixi, TrixiCUDA
 using CUDA
 using BenchmarkTools
 
-equations = CompressibleEulerEquations3D(1.4)
+equations = IdealGlmMhdEquations3D(5 / 3)
 
 initial_condition = initial_condition_convergence_test
-solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)
 
-coordinates_min = (0.0, 0.0, 0.0)
-coordinates_max = (2.0, 2.0, 2.0)
-refinement_patches = ((type = "box", coordinates_min = (0.5, 0.5, 0.5),
-                       coordinates_max = (1.5, 1.5, 1.5)),)
+volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+solver = DGSEM(polydeg = 3,
+               surface_flux = (flux_hlle,
+                               flux_nonconservative_powell),
+               volume_integral = VolumeIntegralFluxDifferencing(volume_flux))
+
+coordinates_min = (-1.0, -1.0, -1.0)
+coordinates_max = (1.0, 1.0, 1.0)
+refinement_patches = ((type = "box", coordinates_min = (-0.5, -0.5, -0.5),
+                       coordinates_max = (0.5, 0.5, 0.5)),)
 mesh = TreeMesh(coordinates_min, coordinates_max,
                 initial_refinement_level = 2,
                 refinement_patches = refinement_patches,
                 n_cells_max = 10_000)
 
-semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
-                                    source_terms = source_terms_convergence_test)
-semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver,
-                                           source_terms = source_terms_convergence_test)
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
 
 tspan = tspan_gpu = (0.0, 1.0)
 t = t_gpu = 0.0
@@ -73,11 +76,10 @@ TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
                               equations_gpu, solver_gpu, cache_gpu)
 
 # Prolong to mortars test
-@benchmark CUDA.@sync TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
-                                                      TrixiCUDA.check_cache_mortars(cache_gpu),
-                                                      solver_gpu, cache_gpu)
-
-# # Mortar flux test
-# @benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu),
-#                             Trixi.have_nonconservative_terms(equations_gpu),
-#                             equations_gpu, solver_gpu, cache_gpu)
+TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
+                                TrixiCUDA.check_cache_mortars(cache_gpu),
+                                solver_gpu, cache_gpu)
+
+@benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu),
+                                                  Trixi.have_nonconservative_terms(equations_gpu),
+                                                  equations_gpu, solver_gpu, cache_gpu)
diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl
index de0643e..e7cae9f 100644
--- a/src/solvers/dg_2d.jl
+++ b/src/solvers/dg_2d.jl
@@ -902,7 +902,7 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_value
                                      fstar_primary_upper, fstar_primary_lower,
                                      fstar_secondary_upper, fstar_secondary_lower,
                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                                     orientations, equations::AbstractEquations{2})
+                                     orientations)
     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
@@ -1619,12 +1619,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
                                                                                 reverse_lower,
                                                                                 neighbor_ids,
                                                                                 large_sides,
-                                                                                orientations,
-                                                                                equations)
+                                                                                orientations)
     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
                                fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations, equations;
+                               orientations;
                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
                                                       size(surface_flux_values, 1),
                                                       size(surface_flux_values, 2),
@@ -1679,12 +1678,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati
                                                                                 reverse_lower,
                                                                                 neighbor_ids,
                                                                                 large_sides,
-                                                                                orientations,
-                                                                                equations)
+                                                                                orientations)
     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper,
                                fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower,
                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations, equations;
+                               orientations;
                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
                                                       size(surface_flux_values, 1),
                                                       size(surface_flux_values, 2),
diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl
index 54ed7be..8bdf846 100644
--- a/src/solvers/dg_3d.jl
+++ b/src/solvers/dg_3d.jl
@@ -1272,8 +1272,128 @@ function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right
 end
 
 # # Kernel for copying mortar fluxes small to small and small to large - step 1
-function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right,
-                                     tmp_lower_left, tmp_lower_right,
+# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right,
+#                                      tmp_lower_left, tmp_lower_right,
+#                                      fstar_primary_upper_left, fstar_primary_upper_right,
+#                                      fstar_primary_lower_left, fstar_primary_lower_right,
+#                                      fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                      fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                      orientations)
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
+#         k <= length(orientations))
+#         j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+#         j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+#         lower_left_element = neighbor_ids[1, k]
+#         lower_right_element = neighbor_ids[2, k]
+#         upper_left_element = neighbor_ids[3, k]
+#         upper_right_element = neighbor_ids[4, k]
+#         large_element = neighbor_ids[5, k]
+
+#         large_side = large_sides[k]
+#         orientation = orientations[k]
+
+#         # Use simple math expression to enhance the performance (against control flow), 
+#         # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
+#         # Please also check the original code in Trixi.jl when you modify this code.
+#         direction = 2 * orientation + large_side - 2
+
+#         surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
+#         surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
+
+#         # Use simple math expression to enhance the performance (against control flow), 
+#         # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
+#         #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
+#         #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
+#         # Please also check the original code in Trixi.jl when you modify this code.
+#         direction = 2 * orientation - large_side + 1
+
+#         @inbounds begin
+#             for j1j1 in axes(reverse_upper, 2)
+#                 tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+#                                                                        fstar_secondary_upper_left[i, j1j1, j2, k]
+#                 tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+#                                                                         fstar_secondary_upper_right[i, j1j1, j2, k]
+#                 tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+#                                                                        fstar_secondary_lower_left[i, j1j1, j2, k]
+#                 tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+#                                                                         fstar_secondary_lower_right[i, j1j1, j2, k]
+#             end
+#         end
+#     end
+
+#     return nothing
+# end
+
+# # Kernel for copying mortar fluxes small to small and small to large - step 2
+# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                      tmp_upper_right, tmp_lower_left, tmp_lower_right,
+#                                      reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                      orientations, equations::AbstractEquations{3})
+#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+
+#     if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
+#         k <= length(orientations))
+#         j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+#         j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+#         large_element = neighbor_ids[5, k]
+
+#         large_side = large_sides[k]
+#         orientation = orientations[k]
+
+#         # See step 1 for the explanation of the following expression
+#         direction = 2 * orientation - large_side + 1
+
+#         @inbounds begin
+#             for j2j2 in axes(reverse_lower, 2)
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+#                                                                                 tmp_upper_left[i, j1, j2j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+#                                                                                 tmp_upper_right[i, j1, j2j2,
+#                                                                                                 direction,
+#                                                                                                 large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+#                                                                                 tmp_lower_left[i, j1, j2j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#                 tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+#                                                                                 tmp_lower_right[i, j1, j2j2,
+#                                                                                                 direction,
+#                                                                                                 large_element]
+#             end
+
+#             surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
+#                                                                                                direction,
+#                                                                                                large_element]
+#         end
+#     end
+
+#     return nothing
+# end
+
+# Kernel for copying mortar fluxes small to small and small to large
+function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values,
+                                     tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right,
                                      fstar_primary_upper_left, fstar_primary_upper_right,
                                      fstar_primary_lower_left, fstar_primary_lower_right,
                                      fstar_secondary_upper_left, fstar_secondary_upper_right,
@@ -1284,108 +1404,98 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_up
     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
 
-    if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
-        k <= length(orientations))
-        j1 = div(j - 1, size(surface_flux_values, 2)) + 1
-        j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
-
-        lower_left_element = neighbor_ids[1, k]
-        lower_right_element = neighbor_ids[2, k]
-        upper_left_element = neighbor_ids[3, k]
-        upper_right_element = neighbor_ids[4, k]
-        large_element = neighbor_ids[5, k]
-
-        large_side = large_sides[k]
-        orientation = orientations[k]
-
-        # Use simple math expression to enhance the performance (against control flow), 
-        # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
-        #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
-        #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
-        #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
-        #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
-        #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
-        # Please also check the original code in Trixi.jl when you modify this code.
-        direction = 2 * orientation + large_side - 2
-
-        surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
-        surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
-
-        # Use simple math expression to enhance the performance (against control flow), 
-        # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
-        #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
-        #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
-        #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
-        #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
-        #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
-        # Please also check the original code in Trixi.jl when you modify this code.
-        direction = 2 * orientation - large_side + 1
+    # Loop stride for each dimension
+    stride_x = gridDim().x * blockDim().x
+    stride_y = gridDim().y * blockDim().y
+    stride_z = gridDim().z * blockDim().z
 
-        @inbounds begin
-            for j1j1 in axes(reverse_upper, 2)
-                tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
-                                                                       fstar_secondary_upper_left[i, j1j1, j2, k]
-                tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
-                                                                        fstar_secondary_upper_right[i, j1j1, j2, k]
-                tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
-                                                                       fstar_secondary_lower_left[i, j1j1, j2, k]
-                tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
-                                                                        fstar_secondary_lower_right[i, j1j1, j2, k]
-            end
-        end
-    end
+    # Cooperative kernel needs stride loops to handle the constrained launch size
+    while i <= size(surface_flux_values, 1)
+        while j <= size(surface_flux_values, 2)^2
+            while k <= length(orientations)
+                j1 = div(j - 1, size(surface_flux_values, 2)) + 1
+                j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+
+                lower_left_element = neighbor_ids[1, k]
+                lower_right_element = neighbor_ids[2, k]
+                upper_left_element = neighbor_ids[3, k]
+                upper_right_element = neighbor_ids[4, k]
+                large_element = neighbor_ids[5, k]
 
-    return nothing
-end
+                large_side = large_sides[k]
+                orientation = orientations[k]
 
-# Kernel for copying mortar fluxes small to small and small to large - step 2
-function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                                     tmp_upper_right, tmp_lower_left, tmp_lower_right,
-                                     reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                                     orientations, equations::AbstractEquations{3})
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-    k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
+                # Use simple math expression to enhance the performance (against control flow), 
+                # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 +
+                #                       isequal(large_side, 1) * isequal(orientation, 2) * 3 +
+                #                       isequal(large_side, 1) * isequal(orientation, 3) * 5 +
+                #                       isequal(large_side, 2) * isequal(orientation, 1) * 2 +
+                #                       isequal(large_side, 2) * isequal(orientation, 2) * 4 +
+                #                       isequal(large_side, 2) * isequal(orientation, 3) * 6`.
+                # Please also check the original code in Trixi.jl when you modify this code.
+                direction = 2 * orientation + large_side - 2
+
+                surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k]
+                surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k]
+
+                # Use simple math expression to enhance the performance (against control flow), 
+                # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 +
+                #                       isequal(large_side, 1) * isequal(orientation, 2) * 4 +
+                #                       isequal(large_side, 1) * isequal(orientation, 3) * 6 +
+                #                       isequal(large_side, 2) * isequal(orientation, 1) * 1 +
+                #                       isequal(large_side, 2) * isequal(orientation, 2) * 3 +
+                #                       isequal(large_side, 2) * isequal(orientation, 3) * 5`.
+                # Please also check the original code in Trixi.jl when you modify this code.
+                direction = 2 * orientation - large_side + 1
 
-    if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 &&
-        k <= length(orientations))
-        j1 = div(j - 1, size(surface_flux_values, 2)) + 1
-        j2 = rem(j - 1, size(surface_flux_values, 2)) + 1
+                @inbounds begin
+                    for j1j1 in axes(reverse_upper, 2)
+                        tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+                                                                               fstar_secondary_upper_left[i, j1j1, j2, k]
+                        tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+                                                                                fstar_secondary_upper_right[i, j1j1, j2, k]
+                        tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] *
+                                                                               fstar_secondary_lower_left[i, j1j1, j2, k]
+                        tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] *
+                                                                                fstar_secondary_lower_right[i, j1j1, j2, k]
+                    end
+                end
 
-        large_element = neighbor_ids[5, k]
+                # Grid scope synchronization
+                grid = CG.this_grid()
+                CG.sync(grid)
 
-        large_side = large_sides[k]
-        orientation = orientations[k]
-
-        # See step 1 for the explanation of the following expression
-        direction = 2 * orientation - large_side + 1
+                @inbounds begin
+                    for j2j2 in axes(reverse_lower, 2)
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+                                                                                        tmp_upper_left[i, j1, j2j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
+                                                                                        tmp_upper_right[i, j1, j2j2,
+                                                                                                        direction,
+                                                                                                        large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+                                                                                        tmp_lower_left[i, j1, j2j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                        tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
+                                                                                        tmp_lower_right[i, j1, j2j2,
+                                                                                                        direction,
+                                                                                                        large_element]
+                    end
 
-        @inbounds begin
-            for j2j2 in axes(reverse_lower, 2)
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
-                                                                                tmp_upper_left[i, j1, j2j2,
-                                                                                               direction,
-                                                                                               large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] *
-                                                                                tmp_upper_right[i, j1, j2j2,
-                                                                                                direction,
-                                                                                                large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
-                                                                                tmp_lower_left[i, j1, j2j2,
-                                                                                               direction,
-                                                                                               large_element]
-                tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] *
-                                                                                tmp_lower_right[i, j1, j2j2,
-                                                                                                direction,
-                                                                                                large_element]
+                    surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
+                                                                                                       direction,
+                                                                                                       large_element]
+                end
+                k += stride_z
             end
-
-            surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2,
-                                                                                               direction,
-                                                                                               large_element]
+            j += stride_y
         end
+        i += stride_x
     end
 
     return nothing
@@ -2118,7 +2228,116 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::False, nonconservat
     @assert iszero(length(cache.mortars.orientations))
 end
 
-# Pack kernels for calculating mortar fluxes
+# # Pack kernels for calculating mortar fluxes
+# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False,
+#                            equations, dg::DGSEM, cache)
+#     surface_flux = dg.surface_integral.surface_flux
+
+#     neighbor_ids = cache.mortars.neighbor_ids
+#     large_sides = cache.mortars.large_sides
+#     orientations = cache.mortars.orientations
+
+#     # The original CPU arrays hold NaNs
+#     u_upper_left = cache.mortars.u_upper_left
+#     u_upper_right = cache.mortars.u_upper_right
+#     u_lower_left = cache.mortars.u_lower_left
+#     u_lower_right = cache.mortars.u_lower_right
+#     reverse_upper = CuArray(dg.mortar.reverse_upper)
+#     reverse_lower = CuArray(dg.mortar.reverse_lower)
+
+#     surface_flux_values = cache.elements.surface_flux_values
+#     tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero
+
+#     fstar_primary_upper_left = cache.fstar_primary_upper_left
+#     fstar_primary_upper_right = cache.fstar_primary_upper_right
+#     fstar_primary_lower_left = cache.fstar_primary_lower_left
+#     fstar_primary_lower_right = cache.fstar_primary_lower_right
+#     fstar_secondary_upper_left = cache.fstar_secondary_upper_left
+#     fstar_secondary_upper_right = cache.fstar_secondary_upper_right
+#     fstar_secondary_lower_left = cache.fstar_secondary_lower_left
+#     fstar_secondary_lower_right = cache.fstar_secondary_lower_right
+
+#     mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left,
+#                                                                 fstar_primary_upper_right,
+#                                                                 fstar_primary_lower_left,
+#                                                                 fstar_primary_lower_right,
+#                                                                 fstar_secondary_upper_left,
+#                                                                 fstar_secondary_upper_right,
+#                                                                 fstar_secondary_lower_left,
+#                                                                 fstar_secondary_lower_right,
+#                                                                 u_upper_left, u_upper_right,
+#                                                                 u_lower_left, u_lower_right,
+#                                                                 orientations, equations,
+#                                                                 surface_flux)
+#     mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right,
+#                        fstar_primary_lower_left, fstar_primary_lower_right,
+#                        fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                        fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                        u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations,
+#                        equations, surface_flux;
+#                        kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3),
+#                                               size(u_upper_left, 4),
+#                                               length(orientations))...)
+
+#     tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 fstar_primary_upper_left,
+#                                                                                 fstar_primary_upper_right,
+#                                                                                 fstar_primary_lower_left,
+#                                                                                 fstar_primary_lower_right,
+#                                                                                 fstar_secondary_upper_left,
+#                                                                                 fstar_secondary_upper_right,
+#                                                                                 fstar_secondary_lower_left,
+#                                                                                 fstar_secondary_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+#                                fstar_primary_lower_left, fstar_primary_lower_right,
+#                                fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                orientations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations,
+#                                                                                 equations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
+#                                reverse_lower, neighbor_ids, large_sides, orientations, equations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     return nothing
+# end
+
+# Pack kernels for calculating mortar fluxes (optimized)
 function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False,
                            equations, dg::DGSEM, cache)
     surface_flux = dg.surface_integral.surface_flux
@@ -2174,8 +2393,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
 
-    # TODO: Combine these two copy kernels into one (synchronization)
     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+                                                                                tmp_surface_flux_values,
                                                                                 tmp_upper_left,
                                                                                 tmp_upper_right,
                                                                                 tmp_lower_left,
@@ -2193,41 +2412,130 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
                                                                                 neighbor_ids,
                                                                                 large_sides,
                                                                                 orientations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                               tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
-                               fstar_primary_lower_left, fstar_primary_lower_right,
-                               fstar_secondary_upper_left, fstar_secondary_upper_right,
-                               fstar_secondary_lower_left, fstar_secondary_lower_right,
-                               reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
-
-    mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
-                                                                                tmp_surface_flux_values,
-                                                                                tmp_upper_left,
-                                                                                tmp_upper_right,
-                                                                                tmp_lower_left,
-                                                                                tmp_lower_right,
-                                                                                reverse_upper,
-                                                                                reverse_lower,
-                                                                                neighbor_ids,
-                                                                                large_sides,
-                                                                                orientations,
-                                                                                equations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                               tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
-                               reverse_lower, neighbor_ids, large_sides, orientations, equations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
+    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right,
+                               tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+                               fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left,
+                               fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right,
+                               reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations;
+                               cooperative = true,
+                               kernel_configurator_coop_3d(mortar_flux_copy_to_kernel,
+                                                           size(surface_flux_values, 1),
+                                                           size(surface_flux_values, 2)^2,
+                                                           length(orientations))...)
 
     return nothing
 end
 
+# # Pack kernels for calculating mortar fluxes
+# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True,
+#                            equations, dg::DGSEM, cache)
+#     surface_flux, nonconservative_flux = dg.surface_integral.surface_flux
+
+#     neighbor_ids = cache.mortars.neighbor_ids
+#     large_sides = cache.mortars.large_sides
+#     orientations = cache.mortars.orientations
+
+#     # The original CPU arrays hold NaNs
+#     u_upper_left = cache.mortars.u_upper_left
+#     u_upper_right = cache.mortars.u_upper_right
+#     u_lower_left = cache.mortars.u_lower_left
+#     u_lower_right = cache.mortars.u_lower_right
+#     reverse_upper = CuArray(dg.mortar.reverse_upper)
+#     reverse_lower = CuArray(dg.mortar.reverse_lower)
+
+#     surface_flux_values = cache.elements.surface_flux_values
+#     tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero
+
+#     fstar_primary_upper_left = cache.fstar_primary_upper_left
+#     fstar_primary_upper_right = cache.fstar_primary_upper_right
+#     fstar_primary_lower_left = cache.fstar_primary_lower_left
+#     fstar_primary_lower_right = cache.fstar_primary_lower_right
+#     fstar_secondary_upper_left = cache.fstar_secondary_upper_left
+#     fstar_secondary_upper_right = cache.fstar_secondary_upper_right
+#     fstar_secondary_lower_left = cache.fstar_secondary_lower_left
+#     fstar_secondary_lower_right = cache.fstar_secondary_lower_right
+
+#     mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left,
+#                                                                 fstar_primary_upper_right,
+#                                                                 fstar_primary_lower_left,
+#                                                                 fstar_primary_lower_right,
+#                                                                 fstar_secondary_upper_left,
+#                                                                 fstar_secondary_upper_right,
+#                                                                 fstar_secondary_lower_left,
+#                                                                 fstar_secondary_lower_right,
+#                                                                 u_upper_left, u_upper_right,
+#                                                                 u_lower_left, u_lower_right,
+#                                                                 orientations, large_sides,
+#                                                                 equations, surface_flux,
+#                                                                 nonconservative_flux)
+#     mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right,
+#                        fstar_primary_lower_left, fstar_primary_lower_right,
+#                        fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                        fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                        u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations,
+#                        large_sides, equations, surface_flux, nonconservative_flux;
+#                        kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3),
+#                                               size(u_upper_left, 4),
+#                                               length(orientations))...)
+
+#     tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
+#     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 fstar_primary_upper_left,
+#                                                                                 fstar_primary_upper_right,
+#                                                                                 fstar_primary_lower_left,
+#                                                                                 fstar_primary_lower_right,
+#                                                                                 fstar_secondary_upper_left,
+#                                                                                 fstar_secondary_upper_right,
+#                                                                                 fstar_secondary_lower_left,
+#                                                                                 fstar_secondary_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
+#                                tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+#                                fstar_primary_lower_left, fstar_primary_lower_right,
+#                                fstar_secondary_upper_left, fstar_secondary_upper_right,
+#                                fstar_secondary_lower_left, fstar_secondary_lower_right,
+#                                reverse_upper, reverse_lower, neighbor_ids, large_sides,
+#                                orientations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+#                                                                                 tmp_surface_flux_values,
+#                                                                                 tmp_upper_left,
+#                                                                                 tmp_upper_right,
+#                                                                                 tmp_lower_left,
+#                                                                                 tmp_lower_right,
+#                                                                                 reverse_upper,
+#                                                                                 reverse_lower,
+#                                                                                 neighbor_ids,
+#                                                                                 large_sides,
+#                                                                                 orientations,
+#                                                                                 equations)
+#     mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
+#                                tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
+#                                reverse_lower, neighbor_ids, large_sides, orientations, equations;
+#                                kernel_configurator_3d(mortar_flux_copy_to_kernel,
+#                                                       size(surface_flux_values, 1),
+#                                                       size(surface_flux_values, 2)^2,
+#                                                       length(orientations))...)
+
+#     return nothing
+# end
+
 # Pack kernels for calculating mortar fluxes
 function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True,
                            equations, dg::DGSEM, cache)
@@ -2285,8 +2593,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
     tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero
     tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero
 
-    # TODO: Combine these two copy kernels into one (synchronization)
     mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
+                                                                                tmp_surface_flux_values,
                                                                                 tmp_upper_left,
                                                                                 tmp_upper_right,
                                                                                 tmp_lower_left,
@@ -2304,37 +2612,16 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
                                                                                 neighbor_ids,
                                                                                 large_sides,
                                                                                 orientations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left,
-                               tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
-                               fstar_primary_lower_left, fstar_primary_lower_right,
-                               fstar_secondary_upper_left, fstar_secondary_upper_right,
-                               fstar_secondary_lower_left, fstar_secondary_lower_right,
-                               reverse_upper, reverse_lower, neighbor_ids, large_sides,
-                               orientations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
-
-    mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values,
-                                                                                tmp_surface_flux_values,
-                                                                                tmp_upper_left,
-                                                                                tmp_upper_right,
-                                                                                tmp_lower_left,
-                                                                                tmp_lower_right,
-                                                                                reverse_upper,
-                                                                                reverse_lower,
-                                                                                neighbor_ids,
-                                                                                large_sides,
-                                                                                orientations,
-                                                                                equations)
-    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left,
-                               tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper,
-                               reverse_lower, neighbor_ids, large_sides, orientations, equations;
-                               kernel_configurator_3d(mortar_flux_copy_to_kernel,
-                                                      size(surface_flux_values, 1),
-                                                      size(surface_flux_values, 2)^2,
-                                                      length(orientations))...)
+    mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right,
+                               tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right,
+                               fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left,
+                               fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right,
+                               reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations;
+                               cooperative = true,
+                               kernel_configurator_coop_3d(mortar_flux_copy_to_kernel,
+                                                           size(surface_flux_values, 1),
+                                                           size(surface_flux_values, 2)^2,
+                                                           length(orientations))...)
 
     return nothing
 end
diff --git a/test.jl b/test.jl
deleted file mode 100644
index 95ad959..0000000
--- a/test.jl
+++ /dev/null
@@ -1,48 +0,0 @@
-# using CUDA
-
-# # Kernel using CG.this_grid() for grid-level synchronization
-# function simple_kernel(data)
-#     idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-
-#     # Simple operation: increment each element
-#     if idx <= length(data)
-#         data[idx] += 1.0f0
-#     end
-
-#     grid = CG.this_grid()
-#     CG.sync(grid)
-
-#     return nothing
-# end
-
-# # Host code
-# function main()
-#     n = 1024
-#     data = CUDA.fill(Float32(1.0), n)  # Initialize array with 1.0s
-
-#     # Kernel launch configuration
-#     threads_per_block = 10
-#     num_blocks = ceil(Int, n / threads_per_block)
-
-#     println("num_blocks: ", num_blocks)
-
-#     # Launch kernel with cooperative launch enabled
-#     kernel = @cuda launch=false simple_kernel(data)
-
-#     kernel(data; blocks=num_blocks, threads=threads_per_block, cooperative=true)
-
-#     # Fetch results back to host
-#     result = Array(data)
-# end
-
-# main()
-
-using CUDA
-
-function query_max_threads_per_block()
-    device = CUDA.device()  # Get the current CUDA device
-    max_threads = CUDA.capability(device).max_threads_per_block  # Query max threads per block
-    println("Maximum threads per block: $max_threads")
-end
-
-query_max_threads_per_block()
diff --git a/test2.jl b/test2.jl
deleted file mode 100644
index ab044f2..0000000
--- a/test2.jl
+++ /dev/null
@@ -1,97 +0,0 @@
-using Trixi, TrixiCUDA
-using CUDA
-using Test
-using BenchmarkTools
-
-include("test/test_macros.jl")
-
-advection_velocity = (0.2, -0.7, 0.5)
-equations = LinearScalarAdvectionEquation3D(advection_velocity)
-
-initial_condition = initial_condition_convergence_test
-solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs)
-
-coordinates_min = (-1.0, -1.0, -1.0)
-coordinates_max = (1.0, 1.0, 1.0)
-refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0),
-                       coordinates_max = (1.0, 1.0, 1.0)),
-                      (type = "box", coordinates_min = (0.0, -0.5, -0.5),
-                       coordinates_max = (0.5, 0.5, 0.5)))
-mesh = TreeMesh(coordinates_min, coordinates_max,
-                initial_refinement_level = 2,
-                refinement_patches = refinement_patches,
-                n_cells_max = 10_000)
-
-semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
-semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
-
-tspan = tspan_gpu = (0.0, 5.0)
-t = t_gpu = 0.0
-
-# Semi on CPU
-(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
-
-# Semi on GPU
-equations_gpu = semi_gpu.equations
-mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
-boundary_conditions_gpu = semi_gpu.boundary_conditions
-source_terms_gpu = semi_gpu.source_terms
-
-# ODE on CPU
-ode = semidiscretize(semi, tspan)
-u_ode = copy(ode.u0)
-du_ode = similar(u_ode)
-u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
-du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
-
-# ODE on GPU
-ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
-u_gpu = copy(ode_gpu.u0)
-du_gpu = similar(u_gpu)
-
-# Tests for components initialization
-@test_approx (u_gpu, u)
-# du is initlaizaed as undefined, cannot test now
-
-# Tests for semidiscretization process
-TrixiCUDA.reset_du!(du_gpu)
-Trixi.reset_du!(du, solver, cache)
-@test_approx (du_gpu, du)
-
-TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
-                                Trixi.have_nonconservative_terms(equations_gpu),
-                                equations_gpu, solver_gpu.volume_integral, solver_gpu,
-                                cache_gpu)
-Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations),
-                            equations, solver.volume_integral, solver, cache)
-@test_approx (du_gpu, du)
-
-TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
-Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver)
-@test_approx (cache_gpu.interfaces.u, cache.interfaces.u)
-
-TrixiCUDA.cuda_interface_flux!(mesh_gpu,
-                               Trixi.have_nonconservative_terms(equations_gpu),
-                               equations_gpu, solver_gpu, cache_gpu)
-Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh,
-                           Trixi.have_nonconservative_terms(equations), equations,
-                           solver.surface_integral, solver, cache)
-@test_approx (cache_gpu.elements.surface_flux_values,
-              cache.elements.surface_flux_values)
-
-TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
-                                   equations_gpu, cache_gpu)
-Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver)
-@test_approx (cache_gpu.boundaries.u, cache.boundaries.u)
-
-TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
-                              Trixi.have_nonconservative_terms(equations_gpu),
-                              equations_gpu, solver_gpu, cache_gpu)
-Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations,
-                          solver.surface_integral, solver)
-@test_approx (cache_gpu.elements.surface_flux_values,
-              cache.elements.surface_flux_values)
-
-TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu,
-                                TrixiCUDA.check_cache_mortars(cache_gpu),
-                                solver_gpu, cache_gpu)