From 041da5749302c119beea6681e9817d762af5a9fb Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Thu, 29 Aug 2024 16:43:51 -1000 Subject: [PATCH 1/4] add mortar flux kernel to 2D solver --- src/auxiliary/helpers.jl | 4 +- src/solvers/dg_2d.jl | 176 +++++++++++++++++++++++++++---- src/solvers/dg_3d.jl | 36 +++++-- test/test_adevction_mortar.jl | 26 ++++- test/test_advection_basic.jl | 11 ++ test/test_euler_ec.jl | 11 ++ test/test_euler_source_terms.jl | 11 ++ test/test_hypdiff_nonperiodic.jl | 11 ++ test/test_script.jl | 53 +++------- 9 files changed, 272 insertions(+), 67 deletions(-) diff --git a/src/auxiliary/helpers.jl b/src/auxiliary/helpers.jl index 8e7e7822..c099e1a8 100644 --- a/src/auxiliary/helpers.jl +++ b/src/auxiliary/helpers.jl @@ -25,9 +25,9 @@ end # Helper function for checking `cache.mortars` @inline function check_cache_mortars(cache) if iszero(length(cache.mortars.orientations)) - return True() - else return False() + else + return True() end end diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index 3f3e31a4..5648297b 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -442,29 +442,29 @@ function prolong_mortars_large2small_kernel!(u_upper, u_lower, u, forward_upper, u2 = size(u, 2) @inbounds begin - for ii in axes(forward_upper, 2) - u_upper[leftright, i, j, k] += forward_upper[j, ii] * + for jj in axes(forward_upper, 2) + u_upper[leftright, i, j, k] += forward_upper[j, jj] * u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * ii, - isequal(orientation, 1) * ii + isequal(orientation, 2) * u2, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * jj, + isequal(orientation, 1) * jj + isequal(orientation, 2) * u2, large_element] * isequal(large_side, 1) - u_lower[leftright, i, j, k] += forward_lower[j, ii] * + u_lower[leftright, i, j, k] += forward_lower[j, jj] * u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * ii, - isequal(orientation, 1) * ii + isequal(orientation, 2) * u2, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * jj, + isequal(orientation, 1) * jj + isequal(orientation, 2) * u2, large_element] * isequal(large_side, 1) end - for ii in axes(forward_lower, 2) - u_upper[leftright, i, j, k] += forward_upper[j, ii] * + for jj in axes(forward_lower, 2) + u_upper[leftright, i, j, k] += forward_upper[j, jj] * u[i, - isequal(orientation, 1) * 1 + isequal(orientation, 2) * ii, - isequal(orientation, 1) * ii + isequal(orientation, 2) * 1, + isequal(orientation, 1) * 1 + isequal(orientation, 2) * jj, + isequal(orientation, 1) * jj + isequal(orientation, 2) * 1, large_element] * isequal(large_side, 2) - u_lower[leftright, i, j, k] += forward_lower[j, ii] * + u_lower[leftright, i, j, k] += forward_lower[j, jj] * u[i, - isequal(orientation, 1) * 1 + isequal(orientation, 2) * ii, - isequal(orientation, 1) * ii + isequal(orientation, 2) * 1, + isequal(orientation, 1) * 1 + isequal(orientation, 2) * jj, + isequal(orientation, 1) * jj + isequal(orientation, 2) * 1, large_element] * isequal(large_side, 2) end end @@ -473,6 +473,82 @@ function prolong_mortars_large2small_kernel!(u_upper, u_lower, u, forward_upper, return nothing end +# Kernel for calculating mortar fluxes +function mortar_flux_kernel!(fstar_upper, fstar_lower, u_upper, u_lower, orientations, equations, + surface_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u_upper, 3) && k <= length(orientations)) + u_ll_upper, u_rr_upper = get_surface_node_vars(u_upper, equations, 
j, k) + u_ll_lower, u_rr_lower = get_surface_node_vars(u_lower, equations, j, k) + orientation = orientations[k] + + flux_upper_node = surface_flux(u_ll_upper, u_rr_upper, orientation, equations) + flux_lower_node = surface_flux(u_ll_lower, u_rr_lower, orientation, equations) + + @inbounds begin + for i in axes(fstar_upper, 1) + fstar_upper[i, j, k] = flux_upper_node[i] + fstar_lower[i, j, k] = flux_lower_node[i] + end + end + end + + return nothing +end + +# Kernel for copying mortar fluxes small to small and small to large +function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, fstar_upper, + fstar_lower, reverse_upper, reverse_lower, neighbor_ids, + large_sides, orientations) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2) && + k <= length(orientations)) + large_element = neighbor_ids[3, k] + upper_element = neighbor_ids[2, k] + lower_element = neighbor_ids[1, k] + + large_side = large_sides[k] + orientation = orientations[k] + + # Use math expression to enhance performance (against control flow), it is equivalent to, + # `(2 - large_side) * (2 - orientation) * 1 + + # (2 - large_side) * (orientation - 1) * 3 + + # (large_side - 1) * (2 - orientation) * 2 + + # (large_side - 1) * (orientation - 1) * 4`. + direction = large_side + 2 * orientation - 2 + + surface_flux_values[i, j, direction, upper_element] = fstar_upper[i, j, k] + surface_flux_values[i, j, direction, lower_element] = fstar_lower[i, j, k] + + # Use math expression to enhance performance (against control flow), it is equivalent to, + # `(2 - large_side) * (2 - orientation) * 2 + + # (2 - large_side) * (orientation - 1) * 4 + + # (large_side - 1) * (2 - orientation) * 1 + + # (large_side - 1) * (orientation - 1) * 3`. 
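+        # For example, `large_side = 1` and `orientation = 2` give
+        # `direction = 2 * 2 - 1 + 1 = 4`, matching the
+        # `(2 - large_side) * (orientation - 1) * 4` term above.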
+ direction = 2 * orientation - large_side + 1 + + @inbounds begin + for ii in axes(reverse_upper, 2) # i.e., ` for ii in axes(reverse_lower, 2)` + tmp_surface_flux_values[i, j, direction, large_element] += reverse_upper[j, ii] * + fstar_upper[i, ii, k] + + reverse_lower[j, ii] * + fstar_lower[i, ii, k] + end + + surface_flux_values[i, j, direction, large_element] = tmp_surface_flux_values[i, j, + direction, + large_element] + end + end + + return nothing +end + # Kernel for calculating surface integrals function surface_integral_kernel!(du, factor_arr, surface_flux_values, equations::AbstractEquations{2}) @@ -825,17 +901,18 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{2}, boundary_conditions::NamedTup end # Dummy function for asserting mortars -function cuda_prolong2mortars!(u, mesh::TreeMesh{2}, cache_mortars::True, dg::DGSEM, cache) +function cuda_prolong2mortars!(u, mesh::TreeMesh{2}, cache_mortars::False, dg::DGSEM, cache) @assert iszero(length(cache.mortars.orientations)) end # Pack kernels for prolonging solution to mortars -function cuda_prolong2mortars!(u, mesh::TreeMesh{2}, cache_mortars::False, dg::DGSEM, cache) +function cuda_prolong2mortars!(u, mesh::TreeMesh{2}, cache_mortars::True, dg::DGSEM, cache) neighbor_ids = CuArray{Int64}(cache.mortars.neighbor_ids) large_sides = CuArray{Int64}(cache.mortars.large_sides) orientations = CuArray{Int64}(cache.mortars.orientations) - u_upper = zero(CuArray{Float64}(cache.mortars.u_upper)) - u_lower = zero(CuArray{Float64}(cache.mortars.u_lower)) + + u_upper = zero(CuArray{Float64}(cache.mortars.u_upper)) # NaN to zero + u_lower = zero(CuArray{Float64}(cache.mortars.u_lower)) # NaN to zero forward_upper = CuArray{Float64}(dg.mortar.forward_upper) forward_lower = CuArray{Float64}(dg.mortar.forward_lower) @@ -871,6 +948,66 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{2}, cache_mortars::False, dg::D return nothing end +# Dummy function for asserting mortar fluxes +function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::False, nonconservative_terms, + equations, dg::DGSEM, cache) + @assert iszero(length(cache.mortars.orientations)) +end + +# Pack kernels for calculating mortar fluxes +function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservative_terms::False, + equations, dg::DGSEM, cache) + surface_flux = dg.surface_integral.surface_flux + + neighbor_ids = CuArray{Int64}(cache.mortars.neighbor_ids) + large_sides = CuArray{Int64}(cache.mortars.large_sides) + orientations = CuArray{Int64}(cache.mortars.orientations) + + u_upper = CuArray{Float64}(cache.mortars.u_upper) + u_lower = CuArray{Float64}(cache.mortars.u_lower) + reverse_upper = CuArray{Float64}(dg.mortar.reverse_upper) + reverse_lower = CuArray{Float64}(dg.mortar.reverse_lower) + + surface_flux_values = CuArray{Float64}(cache.elements.surface_flux_values) + tmp_surface_flux_values = zero(similar(surface_flux_values)) + + fstar_upper = CuArray{Float64}(undef, size(u_upper, 2), size(u_upper, 3), length(orientations)) + fstar_lower = CuArray{Float64}(undef, size(u_upper, 2), size(u_upper, 3), length(orientations)) + + size_arr = CuArray{Float64}(undef, size(u_upper, 3), length(orientations)) + + mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_upper, fstar_lower, u_upper, + u_lower, orientations, equations, + surface_flux) + mortar_flux_kernel(fstar_upper, fstar_lower, u_upper, u_lower, orientations, equations, + surface_flux; + configurator_2d(mortar_flux_kernel, size_arr)...) 
+ + size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(surface_flux_values, 2), + length(orientations)) + + mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, + tmp_surface_flux_values, + fstar_upper, + fstar_lower, + reverse_upper, + reverse_lower, + neighbor_ids, + large_sides, + orientations) + mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_upper, + fstar_lower, + reverse_upper, reverse_lower, neighbor_ids, large_sides, + orientations; + configurator_3d(mortar_flux_copy_to_kernel, size_arr)...) + + cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically +end + +function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservative_terms::True, + equations, dg::DGSEM, cache) +end + # Pack kernels for calculating surface integrals function cuda_surface_integral!(du, mesh::TreeMesh{2}, equations, dg::DGSEM, cache) factor_arr = CuArray{Float64}([ @@ -941,6 +1078,9 @@ function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{2}, equations, initial_condit cuda_prolong2mortars!(u, mesh, check_cache_mortars(cache), dg, cache) + cuda_mortar_flux!(mesh, check_cache_mortars(cache), have_nonconservative_terms(equations), + equations, dg, cache) + cuda_surface_integral!(du, mesh, equations, dg, cache) cuda_jacobian!(du, mesh, equations, cache) diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 5918e17d..f30f642a 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -755,19 +755,20 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{3}, boundary_conditions::NamedTup end # Dummy function for asserting mortars -function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache) +function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::DGSEM, cache) @assert iszero(length(cache.mortars.orientations)) end # Pack kernels for prolonging solution to mortars -function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::DGSEM, cache) +function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache) neighbor_ids = CuArray{Int64}(cache.mortars.neighbor_ids) large_sides = CuArray{Int64}(cache.mortars.large_sides) orientations = CuArray{Int64}(cache.mortars.orientations) - u_upper_left = zero(CuArray{Float64}(cache.mortars.u_upper_left)) - u_upper_right = zero(CuArray{Float64}(cache.mortars.u_upper_right)) - u_lower_left = zero(CuArray{Float64}(cache.mortars.u_lower_left)) - u_lower_right = zero(CuArray{Float64}(cache.mortars.u_lower_right)) + + u_upper_left = zero(CuArray{Float64}(cache.mortars.u_upper_left)) # NaN to zero + u_upper_right = zero(CuArray{Float64}(cache.mortars.u_upper_right)) # NaN to zero + u_lower_left = zero(CuArray{Float64}(cache.mortars.u_lower_left)) # NaN to zero + u_lower_right = zero(CuArray{Float64}(cache.mortars.u_lower_right)) # NaN to zero forward_upper = CuArray{Float64}(dg.mortar.forward_upper) forward_lower = CuArray{Float64}(dg.mortar.forward_lower) @@ -787,10 +788,10 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::D neighbor_ids, large_sides, orientations; configurator_3d(prolong_mortars_small2small_kernel, size_arr)...) 
- tmp_upper_left = zero(similar(u_upper_left)) - tmp_upper_right = zero(similar(u_upper_right)) - tmp_lower_left = zero(similar(u_lower_left)) - tmp_lower_right = zero(similar(u_lower_right)) + tmp_upper_left = zero(similar(u_upper_left)) # undef to zero + tmp_upper_right = zero(similar(u_upper_right)) # undef to zero + tmp_lower_left = zero(similar(u_lower_left)) # undef to zero + tmp_lower_right = zero(similar(u_lower_right)) # undef to zero size_arr = CuArray{Float64}(undef, size(u_upper_left, 2), size(u_upper_left, 3)^2, size(u_upper_left, 5)) @@ -825,6 +826,21 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::D return nothing end +# Dummy function for asserting mortar fluxes +function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::False, nonconservative_terms, + equations, dg::DGSEM, cache) + @assert iszero(length(cache.mortars.orientations)) +end + +# Pack kernels for calculating mortar fluxes +function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False, + equations, dg::DGSEM, cache) +end + +function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True, + equations, dg::DGSEM, cache) +end + # Pack kernels for calculating surface integrals function cuda_surface_integral!(du, mesh::TreeMesh{3}, equations, dg::DGSEM, cache) factor_arr = CuArray{Float64}([ diff --git a/test/test_adevction_mortar.jl b/test/test_adevction_mortar.jl index bdcd5725..255602ed 100644 --- a/test/test_adevction_mortar.jl +++ b/test/test_adevction_mortar.jl @@ -113,7 +113,31 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_upper_gpu ≈ u_upper @test u_lower_gpu ≈ u_lower - # To be implemented + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + + # Test `cuda_surface_integral!` + TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) + Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) + @test CUDA.@allowscalar du_gpu ≈ du + + # Test `cuda_jacobian!` + TrixiGPU.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) + Trixi.apply_jacobian!(du, mesh, equations, solver, cache) + @test CUDA.@allowscalar du_gpu ≈ du + + # Test `cuda_sources!` + TrixiGPU.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, equations_gpu, cache_gpu) + Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) + @test CUDA.@allowscalar du_gpu ≈ du # Copy data back to host du_cpu, u_cpu = TrixiGPU.copy_to_host!(du_gpu, u_gpu) diff --git a/test/test_advection_basic.jl b/test/test_advection_basic.jl index 8dfb2f5f..6f66ae52 100644 --- a/test/test_advection_basic.jl +++ b/test/test_advection_basic.jl @@ -208,6 +208,17 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_upper_gpu ≈ u_upper @test u_lower_gpu ≈ u_lower + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + 
solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Test `cuda_surface_integral!` TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) diff --git a/test/test_euler_ec.jl b/test/test_euler_ec.jl index 47cd4e50..a4a138ac 100644 --- a/test/test_euler_ec.jl +++ b/test/test_euler_ec.jl @@ -211,6 +211,17 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_upper_gpu ≈ u_upper @test u_lower_gpu ≈ u_lower + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Test `cuda_surface_integral!` TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) diff --git a/test/test_euler_source_terms.jl b/test/test_euler_source_terms.jl index 41559d80..5c3ee822 100644 --- a/test/test_euler_source_terms.jl +++ b/test/test_euler_source_terms.jl @@ -209,6 +209,17 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_upper_gpu ≈ u_upper @test u_lower_gpu ≈ u_lower + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Test `cuda_surface_integral!` TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) diff --git a/test/test_hypdiff_nonperiodic.jl b/test/test_hypdiff_nonperiodic.jl index 5bf6600e..77b05cac 100644 --- a/test/test_hypdiff_nonperiodic.jl +++ b/test/test_hypdiff_nonperiodic.jl @@ -221,6 +221,17 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_upper_gpu ≈ u_upper @test u_lower_gpu ≈ u_lower + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, 
cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Test `cuda_surface_integral!` TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) diff --git a/test/test_script.jl b/test/test_script.jl index 779524bd..8bd25ce4 100644 --- a/test/test_script.jl +++ b/test/test_script.jl @@ -3,27 +3,25 @@ using OrdinaryDiffEq using CUDA using Test -advection_velocity = (0.2, -0.7, 0.5) -equations = LinearScalarAdvectionEquation3D(advection_velocity) +advection_velocity = (0.2, -0.7) +equations = LinearScalarAdvectionEquation2D(advection_velocity) initial_condition = initial_condition_convergence_test solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) -coordinates_min = (-1.0, -1.0, -1.0) -coordinates_max = (1.0, 1.0, 1.0) -refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0), - coordinates_max = (1.0, 1.0, 1.0)), - (type = "box", coordinates_min = (0.0, -0.5, -0.5), - coordinates_max = (0.5, 0.5, 0.5))) +coordinates_min = (-1.0, -1.0) +coordinates_max = (1.0, 1.0) +refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0), + coordinates_max = (1.0, 1.0)),) mesh = TreeMesh(coordinates_min, coordinates_max, initial_refinement_level = 2, refinement_patches = refinement_patches, n_cells_max = 10_000) semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) - (; mesh, equations, initial_condition, boundary_conditions, source_terms, solver, cache) = semi -# Get copy for GPU to avoid overwriting during tests + +# Get deep copy for GPU to avoid overwriting during tests mesh_gpu, equations_gpu = deepcopy(mesh), deepcopy(equations) initial_condition_gpu, boundary_conditions_gpu, source_terms_gpu = deepcopy(initial_condition), deepcopy(boundary_conditions), @@ -31,7 +29,7 @@ initial_condition_gpu, boundary_conditions_gpu, source_terms_gpu = deepcopy(init solver_gpu, cache_gpu = deepcopy(solver), deepcopy(cache) t = t_gpu = 0.0 -tspan = (0.0, 5.0) +tspan = (0.0, 1.0) ode = semidiscretize(semi, tspan) u_ode = copy(ode.u0) @@ -72,33 +70,16 @@ TrixiGPU.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, equations Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, solver.surface_integral, solver) +# Test `cuda_prolong2mortars!` TrixiGPU.cuda_prolong2mortars!(u_gpu, mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), solver_gpu, cache_gpu) Trixi.prolong2mortars!(cache, u, mesh, equations, solver.mortar, solver.surface_integral, solver) -u_upper_left = cache_gpu.mortars.u_upper_left -u_upper_right = cache_gpu.mortars.u_upper_right -u_lower_left = cache_gpu.mortars.u_lower_left -u_lower_right = cache_gpu.mortars.u_lower_right - -# u_upper_left1 = cache.mortars.u_upper_left -# u_upper_right1 = cache.mortars.u_upper_right -# u_lower_left1 = cache.mortars.u_lower_left -# u_lower_right1 = cache.mortars.u_lower_right - -# @test cache_gpu.mortars.u_upper == cache.mortars.u_upper -# @test cache_gpu.mortars.u_lower == cache.mortars.u_lower - -# # Test `cuda_surface_integral!` -# TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) -# Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) - -# # Test `cuda_jacobian!` -# TrixiGPU.cuda_jacobian!(du_gpu, mesh_gpu, 
equations_gpu, cache_gpu) -# Trixi.apply_jacobian!(du, mesh, equations, solver, cache) - -# # Test `cuda_sources!` -# TrixiGPU.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, equations_gpu, cache_gpu) -# Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) -# @test CUDA.@allowscalar du ≈ du_gpu +TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) +Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) +@test cache_gpu.elements.surface_flux_values ≈ cache.elements.surface_flux_values From a3782487412a57be38c370d12d3399d75ad42ed6 Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Thu, 29 Aug 2024 22:55:47 -1000 Subject: [PATCH 2/4] add mortar flux kernel to 3D solver --- src/solvers/dg_3d.jl | 228 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 225 insertions(+), 3 deletions(-) diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index f30f642a..0e66c142 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -479,6 +479,158 @@ function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lowe return nothing end +# Kernel for calculating mortar fluxes +function mortar_flux_kernel!(fstar_upper_left, fstar_upper_right, fstar_lower_left, + fstar_lower_right, u_upper_left, u_upper_right, u_lower_left, + u_lower_right, orientations, equations, surface_flux::Any) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(u_upper_left, 3) && j <= size(u_upper_left, 4) && k <= length(orientations)) + u_ll_upper_left, u_rr_upper_left = get_surface_node_vars(u_upper_left, equations, i, j, k) + u_ll_upper_right, u_rr_upper_right = get_surface_node_vars(u_upper_right, equations, i, j, + k) + u_ll_lower_left, u_rr_lower_left = get_surface_node_vars(u_lower_left, equations, i, j, k) + u_ll_lower_right, u_rr_lower_right = get_surface_node_vars(u_lower_right, equations, i, j, + k) + + orientation = orientations[k] + + flux_upper_left_node = surface_flux(u_ll_upper_left, u_rr_upper_left, orientation, + equations) + flux_upper_right_node = surface_flux(u_ll_upper_right, u_rr_upper_right, orientation, + equations) + flux_lower_left_node = surface_flux(u_ll_lower_left, u_rr_lower_left, orientation, + equations) + flux_lower_right_node = surface_flux(u_ll_lower_right, u_rr_lower_right, orientation, + equations) + + @inbounds begin + for ii in axes(fstar_upper_left, 1) + fstar_upper_left[ii, i, j, k] = flux_upper_left_node[ii] + fstar_upper_right[ii, i, j, k] = flux_upper_right_node[ii] + fstar_lower_left[ii, i, j, k] = flux_lower_left_node[ii] + fstar_lower_right[ii, i, j, k] = flux_lower_right_node[ii] + end + end + end + + return nothing +end + +# Kernel for copying mortar fluxes small to small and small to large +function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, + tmp_upper_right, tmp_lower_left, tmp_lower_right, + fstar_upper_left, + fstar_upper_right, fstar_lower_left, fstar_lower_right, + reverse_upper, + reverse_lower, neighbor_ids, large_sides, orientations) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= 
size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && + k <= length(orientations)) + j1 = div(j - 1, size(surface_flux_values, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + + lower_left_element = neighbor_ids[1, k] + lower_right_element = neighbor_ids[2, k] + upper_left_element = neighbor_ids[3, k] + upper_right_element = neighbor_ids[4, k] + large_element = neighbor_ids[5, k] + + large_side = large_sides[k] + orientation = orientations[k] + + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + + # isequal(large_side, 1) * isequal(orientation, 2) * 3 + + # isequal(large_side, 1) * isequal(orientation, 3) * 5 + + # isequal(large_side, 2) * isequal(orientation, 1) * 2 + + # isequal(large_side, 2) * isequal(orientation, 2) * 4 + + # isequal(large_side, 2) * isequal(orientation, 3) * 6`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation + large_side - 2 + + surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_upper_left[i, j1, j2, + k] + surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_upper_right[i, j1, + j2, k] + surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_lower_left[i, j1, j2, + k] + surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_lower_right[i, j1, + j2, k] + + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + + # isequal(large_side, 1) * isequal(orientation, 2) * 4 + + # isequal(large_side, 1) * isequal(orientation, 3) * 6 + + # isequal(large_side, 2) * isequal(orientation, 1) * 1 + + # isequal(large_side, 2) * isequal(orientation, 2) * 3 + + # isequal(large_side, 2) * isequal(orientation, 3) * 5`. + # Please also check the original code in Trixi.jl when you modify this code. 
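+        # For example, `large_side = 2` and `orientation = 3` give
+        # `direction = 2 * 3 - 2 + 1 = 5`, matching the
+        # `isequal(large_side, 2) * isequal(orientation, 3) * 5` term above.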
+ direction = 2 * orientation - large_side + 1 + + @inbounds begin + for j1j1 in axes(reverse_upper, 2) + tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_upper_left[i, j1j1, j2, + k] + tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_upper_right[i, j1j1, + j2, k] + tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_lower_left[i, j1j1, j2, + k] + tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_lower_right[i, j1j1, + j2, k] + end + + for j2j2 in axes(reverse_lower, 2) + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, + j2j2] * + tmp_upper_left[i, + j1, + j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, + j2j2] * + tmp_upper_right[i, + j1, + j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, + j2j2] * + tmp_lower_left[i, + j1, + j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, + j2j2] * + tmp_lower_right[i, + j1, + j2j2, + direction, + large_element] + end + + surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, + j1, + j2, + direction, + large_element] + end + end + + return nothing +end + # Kernel for calculating surface integrals function surface_integral_kernel!(du, factor_arr, surface_flux_values, equations::AbstractEquations{3}) @@ -793,9 +945,6 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG tmp_lower_left = zero(similar(u_lower_left)) # undef to zero tmp_lower_right = zero(similar(u_lower_right)) # undef to zero - size_arr = CuArray{Float64}(undef, size(u_upper_left, 2), size(u_upper_left, 3)^2, - size(u_upper_left, 5)) - prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, @@ -835,6 +984,76 @@ end # Pack kernels for calculating mortar fluxes function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False, equations, dg::DGSEM, cache) + surface_flux = dg.surface_integral.surface_flux + + neighbor_ids = CuArray{Int64}(cache.mortars.neighbor_ids) + large_sides = CuArray{Int64}(cache.mortars.large_sides) + orientations = CuArray{Int64}(cache.mortars.orientations) + + u_upper_left = CuArray{Float64}(cache.mortars.u_upper_left) + u_upper_right = CuArray{Float64}(cache.mortars.u_upper_right) + u_lower_left = CuArray{Float64}(cache.mortars.u_lower_left) + u_lower_right = CuArray{Float64}(cache.mortars.u_lower_right) + reverse_upper = CuArray{Float64}(dg.mortar.reverse_upper) + reverse_lower = CuArray{Float64}(dg.mortar.reverse_lower) + + surface_flux_values = CuArray{Float64}(cache.elements.surface_flux_values) + tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero + + fstar_upper_left = CuArray{Float64}(undef, size(u_upper_left, 2), size(u_upper_left, 3), + size(u_upper_left, 4), length(orientations)) + fstar_upper_right = CuArray{Float64}(undef, size(u_upper_left, 2), size(u_upper_left, 3), + size(u_upper_left, 4), length(orientations)) + fstar_lower_left = CuArray{Float64}(undef, size(u_upper_left, 2), size(u_upper_left, 3), + size(u_upper_left, 4), length(orientations)) + fstar_lower_right = CuArray{Float64}(undef, size(u_upper_left, 2), size(u_upper_left, 3), + size(u_upper_left, 4), 
length(orientations)) + + size_arr = CuArray{Float64}(undef, size(u_upper_left, 3), size(u_upper_left, 4), + length(orientations)) + + mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_upper_left, fstar_upper_right, + fstar_lower_left, fstar_lower_right, + u_upper_left, u_upper_right, + u_lower_left, u_lower_right, + orientations, equations, + surface_flux) + mortar_flux_kernel(fstar_upper_left, fstar_upper_right, fstar_lower_left, fstar_lower_right, + u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, + equations, surface_flux; + configurator_3d(mortar_flux_kernel, size_arr)...) + + tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero + tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero + tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero + tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero + + size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(surface_flux_values, 2)^2, + length(orientations)) + + mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, + tmp_surface_flux_values, + tmp_upper_left, + tmp_upper_right, + tmp_lower_left, + tmp_lower_right, + fstar_upper_left, + fstar_upper_right, + fstar_lower_left, + fstar_lower_right, + reverse_upper, + reverse_lower, + neighbor_ids, + large_sides, + orientations) + mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, + tmp_upper_right, tmp_lower_left, tmp_lower_right, fstar_upper_left, + fstar_upper_right, fstar_lower_left, fstar_lower_right, + reverse_upper, + reverse_lower, neighbor_ids, large_sides, orientations; + configurator_3d(mortar_flux_copy_to_kernel, size_arr)...) + + cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically end function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True, @@ -911,6 +1130,9 @@ function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{3}, equations, initial_condit cuda_prolong2mortars!(u, mesh, cache.mortars, dg, cache) + cuda_mortar_flux!(mesh, cache.mortars, have_nonconservative_terms(equations), + equations, dg, cache) + cuda_surface_integral!(du, mesh, equations, dg, cache) cuda_jacobian!(du, mesh, equations, cache) From 52074eb8bb1a10985ba199c665948e22dfe8c55e Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Thu, 29 Aug 2024 23:00:35 -1000 Subject: [PATCH 3/4] add tests for mortar flux kernel --- test/test_adevction_mortar.jl | 26 ++++++++++++++++++++++++- test/test_advection_basic.jl | 11 +++++++++++ test/test_euler_ec.jl | 11 +++++++++++ test/test_euler_source_terms.jl | 11 +++++++++++ test/test_hypdiff_nonperiodic.jl | 11 +++++++++++ test/test_script.jl | 16 ++++++++------- test/test_shallowwater_well_balanced.jl | 11 +++++++++++ 7 files changed, 89 insertions(+), 8 deletions(-) diff --git a/test/test_adevction_mortar.jl b/test/test_adevction_mortar.jl index 255602ed..6b106b96 100644 --- a/test/test_adevction_mortar.jl +++ b/test/test_adevction_mortar.jl @@ -245,7 +245,31 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_lower_left_gpu ≈ u_lower_left @test u_lower_right_gpu ≈ u_lower_right - # To be implemented + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), 
equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + + # Test `cuda_surface_integral!` + TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) + Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) + @test CUDA.@allowscalar du_gpu ≈ du + + # Test `cuda_jacobian!` + TrixiGPU.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) + Trixi.apply_jacobian!(du, mesh, equations, solver, cache) + @test CUDA.@allowscalar du_gpu ≈ du + + # Test `cuda_sources!` + TrixiGPU.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, equations_gpu, cache_gpu) + Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) + @test CUDA.@allowscalar du_gpu ≈ du # Copy data back to host du_cpu, u_cpu = TrixiGPU.copy_to_host!(du_gpu, u_gpu) diff --git a/test/test_advection_basic.jl b/test/test_advection_basic.jl index 6f66ae52..7d8de9b3 100644 --- a/test/test_advection_basic.jl +++ b/test/test_advection_basic.jl @@ -335,6 +335,17 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_lower_left_gpu ≈ u_lower_left @test u_lower_right_gpu ≈ u_lower_right + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Test `cuda_surface_integral!` TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) diff --git a/test/test_euler_ec.jl b/test/test_euler_ec.jl index a4a138ac..a8c9a25c 100644 --- a/test/test_euler_ec.jl +++ b/test/test_euler_ec.jl @@ -339,6 +339,17 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_lower_left_gpu ≈ u_lower_left @test u_lower_right_gpu ≈ u_lower_right + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Test `cuda_surface_integral!` TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) diff --git a/test/test_euler_source_terms.jl b/test/test_euler_source_terms.jl index 5c3ee822..7a62255d 100644 --- a/test/test_euler_source_terms.jl +++ b/test/test_euler_source_terms.jl @@ -338,6 +338,17 @@ isdir(outdir) && rm(outdir, recursive = 
true) @test u_lower_left_gpu ≈ u_lower_left @test u_lower_right_gpu ≈ u_lower_right + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Test `cuda_surface_integral!` TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) diff --git a/test/test_hypdiff_nonperiodic.jl b/test/test_hypdiff_nonperiodic.jl index 77b05cac..a6e9b21f 100644 --- a/test/test_hypdiff_nonperiodic.jl +++ b/test/test_hypdiff_nonperiodic.jl @@ -357,6 +357,17 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_lower_left_gpu ≈ u_lower_left @test u_lower_right_gpu ≈ u_lower_right + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Test `cuda_surface_integral!` TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) diff --git a/test/test_script.jl b/test/test_script.jl index 8bd25ce4..78eec607 100644 --- a/test/test_script.jl +++ b/test/test_script.jl @@ -3,16 +3,18 @@ using OrdinaryDiffEq using CUDA using Test -advection_velocity = (0.2, -0.7) -equations = LinearScalarAdvectionEquation2D(advection_velocity) +advection_velocity = (0.2, -0.7, 0.5) +equations = LinearScalarAdvectionEquation3D(advection_velocity) initial_condition = initial_condition_convergence_test solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) -coordinates_min = (-1.0, -1.0) -coordinates_max = (1.0, 1.0) -refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0), - coordinates_max = (1.0, 1.0)),) +coordinates_min = (-1.0, -1.0, -1.0) +coordinates_max = (1.0, 1.0, 1.0) +refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0), + coordinates_max = (1.0, 1.0, 1.0)), + (type = "box", coordinates_min = (0.0, -0.5, -0.5), + coordinates_max = (0.5, 0.5, 0.5))) mesh = TreeMesh(coordinates_min, coordinates_max, initial_refinement_level = 2, refinement_patches = refinement_patches, @@ -29,7 +31,7 @@ initial_condition_gpu, boundary_conditions_gpu, source_terms_gpu = deepcopy(init solver_gpu, cache_gpu = deepcopy(solver), deepcopy(cache) t = t_gpu = 0.0 -tspan = (0.0, 1.0) +tspan = (0.0, 5.0) ode = semidiscretize(semi, tspan) u_ode = copy(ode.u0) diff --git a/test/test_shallowwater_well_balanced.jl b/test/test_shallowwater_well_balanced.jl index 18345712..9ab2fca9 100644 --- 
a/test/test_shallowwater_well_balanced.jl +++ b/test/test_shallowwater_well_balanced.jl @@ -238,6 +238,17 @@ isdir(outdir) && rm(outdir, recursive = true) @test u_upper_gpu ≈ u_upper @test u_lower_gpu ≈ u_lower + # Test `cuda_mortar_flux!` + TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, + solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) + surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) + @test surface_flux_values_gpu ≈ surface_flux_values + # Error when testing please check # # Test `cuda_surface_integral!` From 8e31fddcabc6be88d144277c1bdb9bed2e6c76e6 Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Thu, 29 Aug 2024 23:26:36 -1000 Subject: [PATCH 4/4] add examples and update README.md --- README.md | 4 +-- examples/advection_mortar_2d.jl | 58 ++++++++++++++++++++++++++++++ examples/advection_mortar_3d.jl | 62 +++++++++++++++++++++++++++++++++ src/solvers/dg_3d.jl | 4 +-- 4 files changed, 124 insertions(+), 4 deletions(-) create mode 100644 examples/advection_mortar_2d.jl create mode 100644 examples/advection_mortar_3d.jl diff --git a/README.md b/README.md index c3dcb1ba..acfaa0df 100644 --- a/README.md +++ b/README.md @@ -73,8 +73,8 @@ Our current focus is on the semidiscretization of PDEs. The table below shows th # GPU Kernels to be Implemented Kernels left to be implemented on `TreeMesh` with `DGSEM`: - 1D Kernels: 1) `calc_volume_integral!` - `volume_integral::VolumeIntegralShockCapturingHG` -- 2D Kernels: 1) `calc_volume_integral!` - `volume_integral::VolumeIntegralShockCapturingHG`, 2) `calc_mortar_flux!` -- 3D Kernels: 1) `calc_volume_integral!` - `volume_integral::VolumeIntegralShockCapturingHG`, 2) `calc_mortar_flux!` +- 2D Kernels: 1) `calc_volume_integral!` - `volume_integral::VolumeIntegralShockCapturingHG`, 2) `calc_mortar_flux!`- `nonconservative_terms::True` +- 3D Kernels: 1) `calc_volume_integral!` - `volume_integral::VolumeIntegralShockCapturingHG`, 2) `calc_mortar_flux!` - `nonconservative_terms::True` # Show Your Support! We always welcome new people to join us, please feel free to contribute. Also, if you find this package interesting and inspiring, please give it a star. Thanks! 
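The two example elixirs added below run like any other Trixi setup once the project environment is instantiated and a CUDA-capable device is available; an illustrative invocation (path assumed relative to the repository root) is

    julia --project=. -e 'include("examples/advection_mortar_2d.jl")'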
diff --git a/examples/advection_mortar_2d.jl b/examples/advection_mortar_2d.jl new file mode 100644 index 00000000..68a2500f --- /dev/null +++ b/examples/advection_mortar_2d.jl @@ -0,0 +1,58 @@ +using Trixi, TrixiGPU +using OrdinaryDiffEq + +# The example is taken from the Trixi.jl + +############################################################################### +# semidiscretization of the linear advection equation + +advection_velocity = (0.2, -0.7) +equations = LinearScalarAdvectionEquation2D(advection_velocity) + +initial_condition = initial_condition_convergence_test +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0) +coordinates_max = (1.0, 1.0) +refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0), + coordinates_max = (1.0, 1.0)),) +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 2, + refinement_patches = refinement_patches, + n_cells_max = 10_000) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) + +############################################################################### +# ODE solvers, callbacks etc. + +tspan = (0.0, 1.0) +ode = semidiscretize_gpu(semi, tspan) # from TrixiGPU.jl + +summary_callback = SummaryCallback() + +analysis_interval = 100 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval, + extra_analysis_integrals = (entropy,)) + +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +save_solution = SaveSolutionCallback(interval = 100, + save_initial_solution = true, + save_final_solution = true, + solution_variables = cons2prim) + +stepsize_callback = StepsizeCallback(cfl = 1.6) + +callbacks = CallbackSet(summary_callback, + analysis_callback, alive_callback, + save_solution, + stepsize_callback) + +############################################################################### +# run the simulation + +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false), + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + save_everystep = false, callback = callbacks); +summary_callback() # print the timer summary diff --git a/examples/advection_mortar_3d.jl b/examples/advection_mortar_3d.jl new file mode 100644 index 00000000..70e28efa --- /dev/null +++ b/examples/advection_mortar_3d.jl @@ -0,0 +1,62 @@ + +using Trixi, TrixiGPU +using OrdinaryDiffEq + +# The example is taken from the Trixi.jl + +############################################################################### +# semidiscretization of the linear advection equation + +advection_velocity = (0.2, -0.7, 0.5) +equations = LinearScalarAdvectionEquation3D(advection_velocity) + +initial_condition = initial_condition_convergence_test +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0, -1.0) +coordinates_max = (1.0, 1.0, 1.0) +refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0), + coordinates_max = (1.0, 1.0, 1.0)), + (type = "box", coordinates_min = (0.0, -0.5, -0.5), + coordinates_max = (0.5, 0.5, 0.5))) +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 2, + refinement_patches = refinement_patches, + n_cells_max = 10_000) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) + +############################################################################### +# ODE solvers, callbacks etc. 
+ +tspan = (0.0, 5.0) +ode = semidiscretize_gpu(semi, tspan) # from TrixiGPU.jl + +summary_callback = SummaryCallback() + +analysis_interval = 100 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval, + extra_analysis_integrals = (entropy,)) + +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +save_solution = SaveSolutionCallback(interval = 100, + save_initial_solution = true, + save_final_solution = true, + solution_variables = cons2prim) + +stepsize_callback = StepsizeCallback(cfl = 1.2) + +callbacks = CallbackSet(summary_callback, + analysis_callback, + alive_callback, + save_solution, + stepsize_callback) + +############################################################################### +# run the simulation + +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false), + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + save_everystep = false, callback = callbacks); +summary_callback() # print the timer summary diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 0e66c142..5c9f45d0 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -1128,9 +1128,9 @@ function rhs_gpu!(du_cpu, u_cpu, t, mesh::TreeMesh{3}, equations, initial_condit cuda_boundary_flux!(t, mesh, boundary_conditions, equations, dg, cache) - cuda_prolong2mortars!(u, mesh, cache.mortars, dg, cache) + cuda_prolong2mortars!(u, mesh, check_cache_mortars(cache), dg, cache) - cuda_mortar_flux!(mesh, cache.mortars, have_nonconservative_terms(equations), + cuda_mortar_flux!(mesh, check_cache_mortars(cache), have_nonconservative_terms(equations), equations, dg, cache) cuda_surface_integral!(du, mesh, equations, dg, cache)
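
For reference, the small-to-large step in the new 2D `mortar_flux_copy_to_kernel!` amounts to, per conserved variable and per mortar, two dense matrix-vector products with the reverse mortar operators. A minimal CPU sketch of that equivalence (plain Julia with random data; `reverse_upper`/`reverse_lower` stand in for `dg.mortar.reverse_upper`/`dg.mortar.reverse_lower`, and the node count is illustrative):

    nnodes = 4                           # polydeg + 1, illustrative
    reverse_upper = rand(nnodes, nnodes) # stands in for dg.mortar.reverse_upper
    reverse_lower = rand(nnodes, nnodes) # stands in for dg.mortar.reverse_lower
    fstar_upper = rand(nnodes)           # one variable's flux on the upper small face
    fstar_lower = rand(nnodes)           # one variable's flux on the lower small face

    # Loop form, mirroring the kernel's accumulation into `tmp_surface_flux_values`
    f_large = zeros(nnodes)
    for j in 1:nnodes, ii in 1:nnodes
        f_large[j] += reverse_upper[j, ii] * fstar_upper[ii] +
                      reverse_lower[j, ii] * fstar_lower[ii]
    end

    # Equivalent matrix-vector form
    @assert f_large ≈ reverse_upper * fstar_upper + reverse_lower * fstar_lower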