From a015ecb40d34239f0c2f064dade5f7a633498074 Mon Sep 17 00:00:00 2001
From: huiyuxie
Date: Sun, 8 Sep 2024 20:39:43 -1000
Subject: [PATCH 1/5] Start

---
 src/solvers/dg_1d.jl |  6 ++++++
 src/solvers/dg_2d.jl |  6 ++++++
 src/solvers/dg_3d.jl | 22 +++++++++++++++++++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl
index c8e7963..c72c1e7 100644
--- a/src/solvers/dg_1d.jl
+++ b/src/solvers/dg_1d.jl
@@ -645,6 +645,12 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     return nothing
 end
 
+# Pack kernels for calculating volume integrals
+function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::True, equations,
+                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
+    return nothing
+end
+
 # Pack kernels for prolonging solution to interfaces
 function cuda_prolong2interfaces!(u, mesh::TreeMesh{1}, equations, cache)
     neighbor_ids = CuArray{Int64}(cache.interfaces.neighbor_ids)
diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl
index 1feb5a6..928de48 100644
--- a/src/solvers/dg_2d.jl
+++ b/src/solvers/dg_2d.jl
@@ -996,6 +996,12 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
     return nothing
 end
 
+# Pack kernels for calculating volume integrals
+function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::True, equations,
+                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
+    return nothing
+end
+
 # Pack kernels for prolonging solution to interfaces
 function cuda_prolong2interfaces!(u, mesh::TreeMesh{2}, equations, cache)
     neighbor_ids = CuArray{Int64}(cache.interfaces.neighbor_ids)
diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl
index 8c0197e..e280464 100644
--- a/src/solvers/dg_3d.jl
+++ b/src/solvers/dg_3d.jl
@@ -937,6 +937,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms,
     return nothing
 end
 
+# Pack kernels for calculating volume integrals
 function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::False, equations,
                                volume_integral::VolumeIntegralFluxDifferencing, dg::DGSEM, cache)
     volume_flux = volume_integral.volume_flux
@@ -971,12 +972,14 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
     return nothing
 end
 
+# Pack kernels for calculating volume integrals
 function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::True, equations,
                                volume_integral::VolumeIntegralFluxDifferencing, dg::DGSEM, cache)
     # Wait for the immutable MHD implementation in Trixi.jl
     return nothing
 end
 
+# Pack kernels for calculating volume integrals
 function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::False, equations,
                                volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
     volume_flux_dg, volume_flux_fv = dg.volume_integral.volume_flux_dg,
@@ -1073,6 +1076,13 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
     return nothing
 end
 
+# Pack kernels for calculating volume integrals
+function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::True, equations,
+                               volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
+    # Wait for the immutable MHD implementation in Trixi.jl
+    return nothing
+end
+
 # Pack kernels for prolonging solution to interfaces
 function cuda_prolong2interfaces!(u, mesh::TreeMesh{3}, equations, cache)
     neighbor_ids = CuArray{Int64}(cache.interfaces.neighbor_ids)
@@ -1169,7 +1179,7 @@ end
 # Pack kernels for calculating boundary fluxes
 function cuda_boundary_flux!(t, mesh::TreeMesh{3}, boundary_conditions::NamedTuple,
-                             nonconservative_terms, equations, dg::DGSEM, cache)
+                             nonconservative_terms::False, equations, dg::DGSEM, cache)
     surface_flux = dg.surface_integral.surface_flux
     n_boundaries_per_direction = CuArray{Int64}(cache.boundaries.n_boundaries_per_direction)
@@ -1212,6 +1222,13 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{3}, boundary_conditions::NamedTup
     return nothing
 end
 
+# Pack kernels for calculating boundary fluxes
+function cuda_boundary_flux!(t, mesh::TreeMesh{3}, boundary_conditions::NamedTuple,
+                             nonconservative_terms::True, equations, dg::DGSEM, cache)
+    # Wait for the immutable MHD implementation in Trixi.jl
+    return nothing
+end
+
 # Dummy function for asserting mortars
 function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::DGSEM, cache)
     @assert iszero(length(cache.mortars.orientations))
@@ -1362,8 +1379,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati
     cache.elements.surface_flux_values = surface_flux_values # copy back to host automatically
 end
 
+# Pack kernels for calculating mortar fluxes
 function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True,
                            equations, dg::DGSEM, cache)
+    # Wait for the immutable MHD implementation in Trixi.jl
+    return nothing
 end
 
 # Pack kernels for calculating surface integrals

From b892c5338aa7b443383d97aeb02a9efabcdb1ab9 Mon Sep 17 00:00:00 2001
From: huiyuxie
Date: Tue, 10 Sep 2024 16:39:41 -1000
Subject: [PATCH 2/5] Add optimization comments

---
 src/solvers/common.jl |   2 +-
 src/solvers/dg_1d.jl  | 107 +++++++++++++++++++
 src/solvers/dg_2d.jl  |   3 +
 src/solvers/dg_3d.jl  |   3 +
 test/test_script.jl   | 236 +++++++++++++++++++++-----------
 5 files changed, 254 insertions(+), 97 deletions(-)

diff --git a/src/solvers/common.jl b/src/solvers/common.jl
index 2bc6603..c7ce5bd 100644
--- a/src/solvers/common.jl
+++ b/src/solvers/common.jl
@@ -49,7 +49,7 @@ function pure_blended_element_count_kernel!(element_ids_dg, element_ids_dgfv, al
     if (i <= length(alpha))
         dg_only = isapprox(alpha[i], 0, atol = atol)
 
-        if dg_only
+        if dg_only # bad
             element_ids_dg[i] = i
         else
             element_ids_dgfv[i] = i
diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl
index c72c1e7..4486c8d 100644
--- a/src/solvers/dg_1d.jl
+++ b/src/solvers/dg_1d.jl
@@ -149,6 +149,9 @@ function volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u,
 
         element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
 
+        # The sets of `get_node_vars` operations may be combined
+        # into a single set of operations for better performance (to be explored)
+
         u_node = get_node_vars(u, equations, j1, k)
         u_node1 = get_node_vars(u, equations, j2, k)
 
@@ -645,9 +648,113 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     return nothing
 end
 
+# Kernel for calculating pure DG and DG-FV volume fluxes
+function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u,
+                                  element_ids_dgfv, derivative_split,
+                                  equations::AbstractEquations{1},
+                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
+                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
+    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+
+    if (j <= size(u, 2)^2 && k <= size(u, 3))
+        # length(element_ids_dgfv) == size(u, 3)
+
+        j1 = div(j - 1, size(u, 2)) + 1
+        j2 = rem(j - 1, size(u, 2)) + 1
+
+        element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+
+        # The sets of `get_node_vars` operations may be combined
+        # into a single set of operations for better performance (to be explored)
+
+        u_node = get_node_vars(u, equations, j1, k)
+        u_node1 = get_node_vars(u, equations, j2, k)
+
+        volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations)
+        noncons_flux_node = nonconservative_flux_dg(u_node, u_node1, 1, equations)
+
+        @inbounds begin
+            for ii in axes(u, 1)
+                volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii]
+                noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii] * derivative_split[j1, j2]
+            end
+        end
+
+        if j1 !== 1 && j2 === 1 && element_dgfv !== 0 # bad
+            u_ll = get_node_vars(u, equations, j1 - 1, element_dgfv)
+            u_rr = get_node_vars(u, equations, j1, element_dgfv)
+
+            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+            f1_L_node = nonconservative_flux_fv(u_ll, u_rr, 1, equations)
+            f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations)
+
+            @inbounds begin
+                for ii in axes(u, 1)
+                    fstar1_L[ii, j1, element_dgfv] = f1_node[ii] + 0.5 * f1_L_node[ii]
+                    fstar1_R[ii, j1, element_dgfv] = f1_node[ii] + 0.5 * f1_R_node[ii]
+                end
+            end
+        end
+    end
+
+    return nothing
+end
+
 # Pack kernels for calculating volume integrals
 function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::True, equations,
                                volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
+    volume_flux_dg, nonconservative_flux_dg = dg.volume_integral.volume_flux_dg
+    volume_flux_fv, nonconservative_flux_fv = dg.volume_integral.volume_flux_fv
+    indicator = dg.volume_integral.indicator
+
+    # TODO: Get copies of `u` and `du` on both device and host
+    alpha = indicator(Array(u), mesh, equations, dg, cache)
+    alpha = CuArray{Float64}(alpha)
+
+    # For `Float64`, this gives 1.8189894035458565e-12
+    # For `Float32`, this gives 1.1920929f-5
+    atol = 1.8189894035458565e-12 # Ref: `pure_and_blended_element_ids!` in Trixi.jl
+
+    element_ids_dg = zero(CuArray{Int64}(undef, length(alpha)))
+    element_ids_dgfv = zero(CuArray{Int64}(undef, length(alpha)))
+
+    pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
+                                                                                              element_ids_dgfv,
+                                                                                              alpha,
+                                                                                              atol)
+    pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol;
+                                      configurator_1d(pure_blended_element_count_kernel, alpha)...)
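+
+    # After this kernel, `element_ids_dg[i] == i` marks pure DG elements (alpha ≈ 0) and
+    # `element_ids_dgfv[i] == i` marks blended DG-FV elements; entries left at zero are
+    # skipped later via the `element_dgfv !== 0` checks.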
+
+    derivative_split = dg.basis.derivative_split
+    set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!`
+
+    derivative_split = CuArray{Float64}(derivative_split)
+    volume_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
+    noncons_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
+
+    inverse_weights = CuArray{Float64}(dg.basis.inverse_weights)
+    fstar1_L = zero(CuArray{Float64}(undef, size(u, 1), size(u, 2) + 1, size(u, 3)))
+    fstar1_R = zero(CuArray{Float64}(undef, size(u, 1), size(u, 2) + 1, size(u, 3)))
+
+    size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 3))
+
+    volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr,
+                                                                          noncons_flux_arr,
+                                                                          fstar1_L, fstar1_R, u,
+                                                                          element_ids_dgfv,
+                                                                          derivative_split,
+                                                                          equations, volume_flux_dg,
+                                                                          nonconservative_flux_dg,
+                                                                          volume_flux_fv,
+                                                                          nonconservative_flux_fv)
+    volume_flux_dgfv_kernel(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u,
+                            element_ids_dgfv,
+                            derivative_split, equations, volume_flux_dg, nonconservative_flux_dg,
+                            volume_flux_fv, nonconservative_flux_fv;
+                            configurator_2d(volume_flux_dgfv_kernel, size_arr)...)
+
     return nothing
 end
 
diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl
index 928de48..c8cc424 100644
--- a/src/solvers/dg_2d.jl
+++ b/src/solvers/dg_2d.jl
@@ -191,6 +191,9 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, fstar1_L,
 
         element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
 
+        # The sets of `get_node_vars` operations may be combined
+        # into a single set of operations for better performance (to be explored)
+
         u_node = get_node_vars(u, equations, j1, j2, k)
         u_node1 = get_node_vars(u, equations, j3, j2, k)
         u_node2 = get_node_vars(u, equations, j1, j3, k)
diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl
index e280464..5f3cb98 100644
--- a/src/solvers/dg_3d.jl
+++ b/src/solvers/dg_3d.jl
@@ -143,6 +143,9 @@ function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flu
 
         element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
 
+        # The sets of `get_node_vars` operations may be combined
+        # into a single set of operations for better performance (to be explored)
+
         u_node = get_node_vars(u, equations, j1, j2, j3, k)
         u_node1 = get_node_vars(u, equations, j4, j2, j3, k)
         u_node2 = get_node_vars(u, equations, j1, j4, j3, k)
diff --git a/test/test_script.jl b/test/test_script.jl
index 964a894..08e14df 100644
--- a/test/test_script.jl
+++ b/test/test_script.jl
@@ -3,37 +3,81 @@ using OrdinaryDiffEq
 using CUDA
 using Test
 
-equations = CompressibleEulerEquations3D(1.4)
+equations = ShallowWaterEquations1D(gravity_constant = 9.812, H0 = 1.75)
 
-initial_condition = initial_condition_weak_blast_wave
+# Initial condition with a truly discontinuous velocity and bottom topography.
+# Works as intended for TreeMesh1D with `initial_refinement_level=3`. If the mesh
+# refinement level is changed, the initial condition below may need to be changed as
+# well to ensure that the discontinuities lie on an element interface.
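+# (With `initial_refinement_level = 3` on the domain [-3, 3] used below, the mesh has
+# 2^3 = 8 elements of width 0.75, so the jumps at x = -1.5, -0.75, 0.0, and 0.75 all
+# coincide with element interfaces.)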
+function initial_condition_stone_throw_discontinuous_bottom(x, t,
+                                                            equations::ShallowWaterEquations1D)
+
+    # Calculate primitive variables
+
+    # Flat lake
+    H = equations.H0
+
+    # Discontinuous velocity
+    v = 0.0
+    if x[1] >= -0.75 && x[1] <= 0.0
+        v = -1.0
+    elseif x[1] >= 0.0 && x[1] <= 0.75
+        v = 1.0
+    end
+
+    b = (1.5 / exp(0.5 * ((x[1] - 1.0)^2))
+         +
+         0.75 / exp(0.5 * ((x[1] + 1.0)^2)))
+
+    # Force a discontinuous bottom topography
+    if x[1] >= -1.5 && x[1] <= 0.0
+        b = 0.5
+    end
+
+    return prim2cons(SVector(H, v, b), equations)
+end
+
+initial_condition = initial_condition_stone_throw_discontinuous_bottom
+
+boundary_condition = boundary_condition_slip_wall
+
+###############################################################################
+# Get the DG approximation space
+
+volume_flux = (flux_wintermeyer_etal, flux_nonconservative_wintermeyer_etal)
+surface_flux = (FluxHydrostaticReconstruction(flux_lax_friedrichs,
+                                              hydrostatic_reconstruction_audusse_etal),
+                flux_nonconservative_audusse_etal)
+basis = LobattoLegendreBasis(4)
 
-surface_flux = flux_ranocha # OBS! Using a non-dissipative flux is only sensible to test EC,
-# but not for real shock simulations
-volume_flux = flux_ranocha
-polydeg = 3
-basis = LobattoLegendreBasis(polydeg)
 indicator_sc = IndicatorHennemannGassner(equations, basis,
                                          alpha_max = 0.5,
                                          alpha_min = 0.001,
                                          alpha_smooth = true,
-                                         variable = density_pressure)
+                                         variable = waterheight_pressure)
 volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
                                                  volume_flux_dg = volume_flux,
                                                  volume_flux_fv = surface_flux)
+
 solver = DGSEM(basis, surface_flux, volume_integral)
 
-coordinates_min = (-2.0, -2.0, -2.0)
-coordinates_max = (2.0, 2.0, 2.0)
+###############################################################################
+# Create the TreeMesh for the domain [-3, 3]
+
+coordinates_min = -3.0
+coordinates_max = 3.0
 mesh = TreeMesh(coordinates_min, coordinates_max,
                 initial_refinement_level = 3,
-                n_cells_max = 100_000)
+                n_cells_max = 10_000,
+                periodicity = false)
 
-semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+# Create the semidiscretization object
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
+                                    boundary_conditions = boundary_condition)
 
 ###############################################################################
-# ODE solvers, callbacks etc.
+# ODE solver -tspan = (0.0, 0.4) +tspan = (0.0, 3.0) # Get CPU data (; mesh, equations, initial_condition, boundary_conditions, source_terms, solver, cache) = semi @@ -64,85 +108,85 @@ TrixiGPU.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, solver_gpu.volume_integral, solver_gpu, cache_gpu) -Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), - equations, solver.volume_integral, solver, cache) -@test CUDA.@allowscalar du_gpu ≈ du - -# Test `cuda_prolong2interfaces!` -TrixiGPU.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) -interfaces_u_gpu = replace(cache_gpu.interfaces.u, NaN => 0.0) -interfaces_u = replace(cache.interfaces.u, NaN => 0.0) -@test interfaces_u_gpu ≈ interfaces_u - -# Test `cuda_interface_flux!` -TrixiGPU.cuda_interface_flux!(mesh_gpu, Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.surface_integral, solver, cache) -surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) -surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) -@test surface_flux_values_gpu ≈ surface_flux_values - -# Test `cuda_prolong2boundaries!` -TrixiGPU.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, equations_gpu, - cache_gpu) -Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) -boundaries_u_gpu = replace(cache_gpu.boundaries.u, NaN => 0.0) -boundaries_u = replace(cache.boundaries.u, NaN => 0.0) -@test boundaries_u_gpu ≈ boundaries_u - -# Test `cuda_boundary_flux!` -TrixiGPU.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, - solver_gpu, cache_gpu) -Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - solver.surface_integral, solver) -surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) -surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) -@test surface_flux_values_gpu ≈ surface_flux_values - -# Test `cuda_prolong2mortars!` -TrixiGPU.cuda_prolong2mortars!(u_gpu, mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) -Trixi.prolong2mortars!(cache, u, mesh, equations, - solver.mortar, solver.surface_integral, solver) -u_upper_left_gpu = replace(cache_gpu.mortars.u_upper_left, NaN => 0.0) -u_upper_right_gpu = replace(cache_gpu.mortars.u_upper_right, NaN => 0.0) -u_lower_left_gpu = replace(cache_gpu.mortars.u_lower_left, NaN => 0.0) -u_lower_right_gpu = replace(cache_gpu.mortars.u_lower_right, NaN => 0.0) -u_upper_left = replace(cache.mortars.u_upper_left, NaN => 0.0) -u_upper_right = replace(cache.mortars.u_upper_right, NaN => 0.0) -u_lower_left = replace(cache.mortars.u_lower_left, NaN => 0.0) -u_lower_right = replace(cache.mortars.u_lower_right, NaN => 0.0) -@test u_upper_left_gpu ≈ u_upper_left -@test u_upper_right_gpu ≈ u_upper_right -@test u_lower_left_gpu ≈ u_lower_left -@test u_lower_right_gpu ≈ u_lower_right - -# Test `cuda_mortar_flux!` -TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), - Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, - solver_gpu, cache_gpu) 
-Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.mortar, solver.surface_integral, solver, cache) -surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) -surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) -@test surface_flux_values_gpu ≈ surface_flux_values - -# Test `cuda_surface_integral!` -TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) -Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) -@test CUDA.@allowscalar du_gpu ≈ du - -# Test `cuda_jacobian!` -TrixiGPU.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) -Trixi.apply_jacobian!(du, mesh, equations, solver, cache) -@test CUDA.@allowscalar du_gpu ≈ du - -# Test `cuda_sources!` -TrixiGPU.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, equations_gpu, cache_gpu) -Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) -@test CUDA.@allowscalar du_gpu ≈ du +# Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), +# equations, solver.volume_integral, solver, cache) +# @test CUDA.@allowscalar du_gpu ≈ du + +# # Test `cuda_prolong2interfaces!` +# TrixiGPU.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) +# Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) +# interfaces_u_gpu = replace(cache_gpu.interfaces.u, NaN => 0.0) +# interfaces_u = replace(cache.interfaces.u, NaN => 0.0) +# @test interfaces_u_gpu ≈ interfaces_u + +# # Test `cuda_interface_flux!` +# TrixiGPU.cuda_interface_flux!(mesh_gpu, Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, +# Trixi.have_nonconservative_terms(equations), equations, +# solver.surface_integral, solver, cache) +# surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) +# surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) +# @test surface_flux_values_gpu ≈ surface_flux_values + +# # Test `cuda_prolong2boundaries!` +# TrixiGPU.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, equations_gpu, +# cache_gpu) +# Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) +# boundaries_u_gpu = replace(cache_gpu.boundaries.u, NaN => 0.0) +# boundaries_u = replace(cache.boundaries.u, NaN => 0.0) +# @test boundaries_u_gpu ≈ boundaries_u + +# # Test `cuda_boundary_flux!` +# TrixiGPU.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, +# Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, +# solver_gpu, cache_gpu) +# Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, +# solver.surface_integral, solver) +# surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) +# surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) +# @test surface_flux_values_gpu ≈ surface_flux_values + +# # Test `cuda_prolong2mortars!` +# TrixiGPU.cuda_prolong2mortars!(u_gpu, mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), +# solver_gpu, cache_gpu) +# Trixi.prolong2mortars!(cache, u, mesh, equations, +# solver.mortar, solver.surface_integral, solver) +# u_upper_left_gpu = replace(cache_gpu.mortars.u_upper_left, NaN => 0.0) +# u_upper_right_gpu = replace(cache_gpu.mortars.u_upper_right, NaN => 0.0) +# 
u_lower_left_gpu = replace(cache_gpu.mortars.u_lower_left, NaN => 0.0) +# u_lower_right_gpu = replace(cache_gpu.mortars.u_lower_right, NaN => 0.0) +# u_upper_left = replace(cache.mortars.u_upper_left, NaN => 0.0) +# u_upper_right = replace(cache.mortars.u_upper_right, NaN => 0.0) +# u_lower_left = replace(cache.mortars.u_lower_left, NaN => 0.0) +# u_lower_right = replace(cache.mortars.u_lower_right, NaN => 0.0) +# @test u_upper_left_gpu ≈ u_upper_left +# @test u_upper_right_gpu ≈ u_upper_right +# @test u_lower_left_gpu ≈ u_lower_left +# @test u_lower_right_gpu ≈ u_lower_right + +# # Test `cuda_mortar_flux!` +# TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), +# Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, +# solver_gpu, cache_gpu) +# Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, +# Trixi.have_nonconservative_terms(equations), equations, +# solver.mortar, solver.surface_integral, solver, cache) +# surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) +# surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) +# @test surface_flux_values_gpu ≈ surface_flux_values + +# # Test `cuda_surface_integral!` +# TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) +# Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) +# @test CUDA.@allowscalar du_gpu ≈ du + +# # Test `cuda_jacobian!` +# TrixiGPU.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) +# Trixi.apply_jacobian!(du, mesh, equations, solver, cache) +# @test CUDA.@allowscalar du_gpu ≈ du + +# # Test `cuda_sources!` +# TrixiGPU.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, equations_gpu, cache_gpu) +# Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) +# @test CUDA.@allowscalar du_gpu ≈ du From 144b2743d1e2a024017b7e004c2bf50b5dd76737 Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Tue, 10 Sep 2024 18:04:42 -1000 Subject: [PATCH 3/5] Minor fix --- src/solvers/dg_1d.jl | 71 +++++++++++++++++-- src/solvers/dg_3d.jl | 5 +- ...odic.jl => test_shallowwater_dirichlet.jl} | 16 +---- 3 files changed, 71 insertions(+), 21 deletions(-) rename test/{test_shallowwater_nonperiodic.jl => test_shallowwater_dirichlet.jl} (96%) diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl index 4486c8d..b6689f9 100644 --- a/src/solvers/dg_1d.jl +++ b/src/solvers/dg_1d.jl @@ -85,8 +85,9 @@ function symmetric_noncons_flux_kernel!(symmetric_flux_arr, noncons_flux_arr, u, @inbounds begin for ii in axes(u, 1) - symmetric_flux_arr[ii, j1, j2, k] = symmetric_flux_node[ii] - noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii] * derivative_split[j1, j2] + symmetric_flux_arr[ii, j1, j2, k] = symmetric_flux_node[ii] * + derivative_split[j1, j2] + noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii] end end end @@ -123,8 +124,8 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, nonco integral_contribution = 0.0 # change back to `Float32` for ii in axes(du, 2) - du[i, j, k] += derivative_split[j, ii] * symmetric_flux_arr[i, j, ii, k] - integral_contribution += noncons_flux_arr[i, j, ii, k] + du[i, j, k] += symmetric_flux_arr[i, j, ii, k] + integral_contribution += derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k] end du[i, j, k] += 0.5 * integral_contribution # change back to `Float32` @@ -702,6 +703,63 @@ function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, f return nothing end +# # 
Kernel for calculating symmetric and nonconservative volume integrals +# function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, noncons_flux_arr) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) +# @inbounds begin +# integral_contribution = 0.0 # change back to `Float32` + +# for ii in axes(du, 2) +# du[i, j, k] += derivative_split[j, ii] * symmetric_flux_arr[i, j, ii, k] +# integral_contribution += noncons_flux_arr[i, j, ii, k] +# end + +# du[i, j, k] += 0.5 * integral_contribution # change back to `Float32` +# end +# end + +# return nothing +# end + +# # Kernel for calculating DG volume integral contribution +# function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, +# volume_flux_arr, noncons_flux_arr) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) +# # length(element_ids_dg) == size(du, 3) +# # length(element_ids_dgfv) == size(du, 3) + +# element_dg = element_ids_dg[k] # check if `element_dg` is zero +# element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero +# alpha_element = alpha[k] + +# @inbounds begin +# if element_dg !== 0 # bad +# for ii in axes(du, 2) +# du[i, j, element_dg] += derivative_split[j, ii] * +# volume_flux_arr[i, j, ii, element_dg] +# end +# end + +# if element_dgfv !== 0 # bad +# for ii in axes(du, 2) +# du[i, j, element_dgfv] += (1 - alpha_element) * derivative_split[j, ii] * +# volume_flux_arr[i, j, ii, element_dgfv] +# end +# end +# end +# end + +# return nothing +# end + # Pack kernels for calculating volume integrals function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::True, equations, volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache) @@ -750,9 +808,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: volume_flux_fv, nonconservative_flux_fv) volume_flux_dgfv_kernel(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u, - element_ids_dgfv, - derivative_split, equations, volume_flux_dg, nonconservative_flux_dg, - volume_flux_fv, nonconservative_flux_fv; + element_ids_dgfv, derivative_split, equations, volume_flux_dg, + nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv; configurator_2d(volume_flux_dgfv_kernel, size_arr)...) 
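+
+    # At this point `volume_flux_arr` / `noncons_flux_arr` hold the two-point DG fluxes for
+    # every node pair of each element, and `fstar1_L` / `fstar1_R` hold the FV subcell
+    # fluxes of the DG-FV elements.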
return nothing diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 5f3cb98..08a5f40 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -945,7 +945,10 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_integral::VolumeIntegralFluxDifferencing, dg::DGSEM, cache) volume_flux = volume_integral.volume_flux - derivative_split = CuArray{Float64}(dg.basis.derivative_split) + derivative_split = dg.basis.derivative_split + set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` + + derivative_split = CuArray{Float64}(derivative_split) volume_flux_arr1 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), size(u, 2), size(u, 5)) volume_flux_arr2 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), diff --git a/test/test_shallowwater_nonperiodic.jl b/test/test_shallowwater_dirichlet.jl similarity index 96% rename from test/test_shallowwater_nonperiodic.jl rename to test/test_shallowwater_dirichlet.jl index 93ef131..4469892 100644 --- a/test/test_shallowwater_nonperiodic.jl +++ b/test/test_shallowwater_dirichlet.jl @@ -11,19 +11,9 @@ isdir(outdir) && rm(outdir, recursive = true) # Test precision of the semidiscretization process @testset "Test Shallow Water" begin @testset "Shallow Water 1D" begin - equations = ShallowWaterEquations1D(gravity_constant = 1.0, H0 = 3.0) + equations = ShallowWaterEquations1D(gravity_constant = 9.81, H0 = 3.0) - function initial_condition_well_balancedness(x, t, equations::ShallowWaterEquations1D) - # Set the background values - H = equations.H0 - v = 0.0 - - b = (1.5 / exp(0.5 * ((x[1] - 1.0)^2)) + 0.75 / exp(0.5 * ((x[1] + 1.0)^2))) - - return prim2cons(SVector(H, v, b), equations) - end - - initial_condition = initial_condition_well_balancedness + initial_condition = initial_condition_convergence_test boundary_condition = BoundaryConditionDirichlet(initial_condition) @@ -43,7 +33,7 @@ isdir(outdir) && rm(outdir, recursive = true) semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, boundary_conditions = boundary_condition) - tspan = (0.0, 100.0) + tspan = (0.0, 1.0) # Get CPU data (; mesh, equations, initial_condition, boundary_conditions, source_terms, solver, cache) = semi From 9838536ef92fcd7bfe40b7d370c9674f0ed28906 Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Tue, 10 Sep 2024 18:20:57 -1000 Subject: [PATCH 4/5] Reformat --- examples/shallowwater_dirichlet_1d.jl | 17 +++-------------- test/test_shallowwater_dirichlet.jl | 1 - 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/examples/shallowwater_dirichlet_1d.jl b/examples/shallowwater_dirichlet_1d.jl index 276f002..97159c4 100644 --- a/examples/shallowwater_dirichlet_1d.jl +++ b/examples/shallowwater_dirichlet_1d.jl @@ -6,20 +6,9 @@ using OrdinaryDiffEq ############################################################################### # Semidiscretization of the shallow water equations -equations = ShallowWaterEquations1D(gravity_constant = 1.0, H0 = 3.0) +equations = ShallowWaterEquations1D(gravity_constant = 9.81, H0 = 3.0) -# An initial condition with constant total water height and zero velocities to test well-balancedness. 
-function initial_condition_well_balancedness(x, t, equations::ShallowWaterEquations1D)
-    # Set the background values
-    H = equations.H0
-    v = 0.0
-
-    b = (1.5 / exp(0.5 * ((x[1] - 1.0)^2)) + 0.75 / exp(0.5 * ((x[1] + 1.0)^2)))
-
-    return prim2cons(SVector(H, v, b), equations)
-end
-
-initial_condition = initial_condition_well_balancedness
+initial_condition = initial_condition_convergence_test
 
 boundary_condition = BoundaryConditionDirichlet(initial_condition)
 
@@ -49,7 +38,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
 ###############################################################################
 # ODE solvers, callbacks etc.
 
-tspan = (0.0, 100.0)
+tspan = (0.0, 1.0)
 ode = semidiscretize_gpu(semi, tspan) # from TrixiGPU.jl
 
 summary_callback = SummaryCallback()
diff --git a/test/test_shallowwater_dirichlet.jl b/test/test_shallowwater_dirichlet.jl
index 7ddfab0..4469892 100644
--- a/test/test_shallowwater_dirichlet.jl
+++ b/test/test_shallowwater_dirichlet.jl
@@ -11,7 +11,6 @@ isdir(outdir) && rm(outdir, recursive = true)
 # Test precision of the semidiscretization process
 @testset "Test Shallow Water" begin
     @testset "Shallow Water 1D" begin
-
         equations = ShallowWaterEquations1D(gravity_constant = 9.81, H0 = 3.0)
 
         initial_condition = initial_condition_convergence_test

From fb45900979ea16548d9aaba5f08bb22df085cda1 Mon Sep 17 00:00:00 2001
From: huiyuxie
Date: Wed, 11 Sep 2024 22:40:53 -1000
Subject: [PATCH 5/5] Complete 1D

---
 src/solvers/dg_1d.jl                     | 242 ++++++++++++-----------
 src/solvers/dg_2d.jl                     |   8 +-
 src/solvers/dg_3d.jl                     |   8 +-
 test/test_script.jl                      |  85 +--------
 test/test_shallowwater_shockcapturing.jl | 118 +++++++++++
 5 files changed, 258 insertions(+), 203 deletions(-)
 create mode 100644 test/test_shallowwater_shockcapturing.jl

diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl
index b6689f9..26f8b7c 100644
--- a/src/solvers/dg_1d.jl
+++ b/src/solvers/dg_1d.jl
@@ -181,9 +181,63 @@ function volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u,
     return nothing
 end
 
+# Kernel for calculating pure DG and DG-FV volume fluxes
+function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u,
+                                  element_ids_dgfv, derivative_split,
+                                  equations::AbstractEquations{1},
+                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
+                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
+    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
+
+    if (j <= size(u, 2)^2 && k <= size(u, 3))
+        # length(element_ids_dgfv) == size(u, 3)
+
+        j1 = div(j - 1, size(u, 2)) + 1
+        j2 = rem(j - 1, size(u, 2)) + 1
+
+        element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
+
+        # The sets of `get_node_vars` operations may be combined
+        # into a single set of operations for better performance (to be explored)
+
+        u_node = get_node_vars(u, equations, j1, k)
+        u_node1 = get_node_vars(u, equations, j2, k)
+
+        volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations)
+        noncons_flux_node = nonconservative_flux_dg(u_node, u_node1, 1, equations)
+
+        @inbounds begin
+            for ii in axes(u, 1)
+                volume_flux_arr[ii, j1, j2, k] = derivative_split[j1, j2] * volume_flux_node[ii]
+                noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii]
+            end
+        end
+
+        if j1 !== 1 && j2 === 1 && element_dgfv !== 0 # bad
+            u_ll = get_node_vars(u, equations, j1 - 1, element_dgfv)
+            u_rr = get_node_vars(u, equations, j1, element_dgfv)
+
+            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
+
+            f1_L_node =
nonconservative_flux_fv(u_ll, u_rr, 1, equations) + f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations) + + @inbounds begin + for ii in axes(u, 1) + fstar1_L[ii, j1, element_dgfv] = f1_node[ii] + 0.5 * f1_L_node[ii] + fstar1_R[ii, j1, element_dgfv] = f1_node[ii] + 0.5 * f1_R_node[ii] + end + end + end + end + + return nothing +end + # Kernel for calculating DG volume integral contribution function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, - volume_flux_arr) + volume_flux_arr, equations::AbstractEquations{1}) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z @@ -216,6 +270,53 @@ function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, return nothing end +# Kernel for calculating DG volume integral contribution +function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, + volume_flux_arr, noncons_flux_arr, + equations::AbstractEquations{1}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + # length(element_ids_dg) == size(du, 3) + # length(element_ids_dgfv) == size(du, 3) + + element_dg = element_ids_dg[k] # check if `element_dg` is zero + element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero + alpha_element = alpha[k] + + @inbounds begin + if element_dg !== 0 # bad + integral_contribution = 0.0 + + for ii in axes(du, 2) + du[i, j, element_dg] += volume_flux_arr[i, j, ii, element_dg] + integral_contribution += derivative_split[j, ii] * + noncons_flux_arr[i, j, ii, element_dg] + end + + du[i, j, element_dg] += 0.5 * integral_contribution + end + + if element_dgfv !== 0 # bad + integral_contribution = 0.0 + + for ii in axes(du, 2) + du[i, j, element_dgfv] += (1 - alpha_element) * + volume_flux_arr[i, j, ii, element_dgfv] + integral_contribution += derivative_split[j, ii] * + noncons_flux_arr[i, j, ii, element_dgfv] + end + + du[i, j, element_dgfv] += 0.5 * (1 - alpha_element) * integral_contribution + end + end + end + + return nothing +end + # Kernel for calculating FV volume integral contribution function volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv, alpha) @@ -477,7 +578,7 @@ end # Kernel for calculating source terms function source_terms_kernel!(du, u, node_coordinates, t, equations::AbstractEquations{1}, - source_terms::Function) + source_terms::Any) j = (blockIdx().x - 1) * blockDim().x + threadIdx().x k = (blockIdx().y - 1) * blockDim().y + threadIdx().y @@ -631,9 +732,10 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: element_ids_dgfv, alpha, derivative_split, - volume_flux_arr) + volume_flux_arr, + equations) volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, - volume_flux_arr; + volume_flux_arr, equations; configurator_3d(volume_integral_dg_kernel, du)...) 
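+
+    # (Passing `equations` through lets Julia dispatch pick the 1D kernel methods; without
+    # it, some 1D and 2D methods of `volume_integral_dg_kernel!` would share the same name
+    # and argument count.)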
     size_arr = CuArray{Float64}(undef, size(u, 2), size(u, 3))
@@ -649,117 +751,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     return nothing
 end
 
-# Kernel for calculating pure DG and DG-FV volume fluxes
-function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u,
-                                  element_ids_dgfv, derivative_split,
-                                  equations::AbstractEquations{1},
-                                  volume_flux_dg::Any, nonconservative_flux_dg::Any,
-                                  volume_flux_fv::Any, nonconservative_flux_fv::Any)
-    j = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-
-    if (j <= size(u, 2)^2 && k <= size(u, 3))
-        # length(element_ids_dgfv) == size(u, 3)
-
-        j1 = div(j - 1, size(u, 2)) + 1
-        j2 = rem(j - 1, size(u, 2)) + 1
-
-        element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-
-        # The sets of `get_node_vars` operations may be combined
-        # into a single set of operations for better performance (to be explored)
-
-        u_node = get_node_vars(u, equations, j1, k)
-        u_node1 = get_node_vars(u, equations, j2, k)
-
-        volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations)
-        noncons_flux_node = nonconservative_flux_dg(u_node, u_node1, 1, equations)
-
-        @inbounds begin
-            for ii in axes(u, 1)
-                volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii]
-                noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii] * derivative_split[j1, j2]
-            end
-        end
-
-        if j1 !== 1 && j2 === 1 && element_dgfv !== 0 # bad
-            u_ll = get_node_vars(u, equations, j1 - 1, element_dgfv)
-            u_rr = get_node_vars(u, equations, j1, element_dgfv)
-
-            f1_node = volume_flux_fv(u_ll, u_rr, 1, equations)
-
-            f1_L_node = nonconservative_flux_fv(u_ll, u_rr, 1, equations)
-            f1_R_node = nonconservative_flux_fv(u_rr, u_ll, 1, equations)
-
-            @inbounds begin
-                for ii in axes(u, 1)
-                    fstar1_L[ii, j1, element_dgfv] = f1_node[ii] + 0.5 * f1_L_node[ii]
-                    fstar1_R[ii, j1, element_dgfv] = f1_node[ii] + 0.5 * f1_R_node[ii]
-                end
-            end
-        end
-    end
-
-    return nothing
-end
-
-# # Kernel for calculating symmetric and nonconservative volume integrals
-# function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, noncons_flux_arr)
-#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
-
-#     if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3))
-#         @inbounds begin
-#             integral_contribution = 0.0 # change back to `Float32`
-
-#             for ii in axes(du, 2)
-#                 du[i, j, k] += derivative_split[j, ii] * symmetric_flux_arr[i, j, ii, k]
-#                 integral_contribution += noncons_flux_arr[i, j, ii, k]
-#             end
-
-#             du[i, j, k] += 0.5 * integral_contribution # change back to `Float32`
-#         end
-#     end
-
-#     return nothing
-# end
-
-# # Kernel for calculating DG volume integral contribution
-# function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split,
-#                                     volume_flux_arr, noncons_flux_arr)
-#     i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
-#     j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
-#     k = (blockIdx().z - 1) * blockDim().z + threadIdx().z
-
-#     if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3))
-#         # length(element_ids_dg) == size(du, 3)
-#         # length(element_ids_dgfv) == size(du, 3)
-
-#         element_dg = element_ids_dg[k] # check if `element_dg` is zero
-#         element_dgfv = element_ids_dgfv[k] # check if `element_dgfv` is zero
-#         alpha_element = alpha[k]
-
-#         @inbounds begin
-#             if element_dg !== 0 # bad
-#                 for ii in axes(du, 2)
-# du[i, j, element_dg] += derivative_split[j, ii] * -# volume_flux_arr[i, j, ii, element_dg] -# end -# end - -# if element_dgfv !== 0 # bad -# for ii in axes(du, 2) -# du[i, j, element_dgfv] += (1 - alpha_element) * derivative_split[j, ii] * -# volume_flux_arr[i, j, ii, element_dgfv] -# end -# end -# end -# end - -# return nothing -# end - # Pack kernels for calculating volume integrals function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::True, equations, volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache) @@ -812,6 +803,27 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv; configurator_2d(volume_flux_dgfv_kernel, size_arr)...) + volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg, + element_ids_dgfv, + alpha, + derivative_split, + volume_flux_arr, + noncons_flux_arr, + equations) + volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, + volume_flux_arr, noncons_flux_arr, equations; + configurator_3d(volume_integral_dg_kernel, du)...) + + size_arr = CuArray{Float64}(undef, size(u, 2), size(u, 3)) + + volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L, + fstar1_R, + inverse_weights, + element_ids_dgfv, + alpha) + volume_integral_fv_kernel(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv, alpha; + configurator_2d(volume_integral_fv_kernel, size_arr)...) + return nothing end diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index c8cc424..965fab8 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -240,7 +240,8 @@ end # Kernel for calculating DG volume integral contribution function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, - volume_flux_arr1, volume_flux_arr2) + volume_flux_arr1, volume_flux_arr2, + equations::AbstractEquations{2}) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z @@ -979,9 +980,10 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: alpha, derivative_split, volume_flux_arr1, - volume_flux_arr2) + volume_flux_arr2, + equations) volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, - volume_flux_arr1, volume_flux_arr2; + volume_flux_arr1, volume_flux_arr2, equations; configurator_3d(volume_integral_dg_kernel, size_arr)...) 
size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 4)) diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 08a5f40..8b106e2 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -208,7 +208,8 @@ end # Kernel for calculating DG volume integral contribution function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, - volume_flux_arr1, volume_flux_arr2, volume_flux_arr3) + volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, + equations::AbstractEquations{3}) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z @@ -1061,9 +1062,10 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: derivative_split, volume_flux_arr1, volume_flux_arr2, - volume_flux_arr3) + volume_flux_arr3, + equations) volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, - volume_flux_arr1, volume_flux_arr2, volume_flux_arr3; + volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, equations; configurator_3d(volume_integral_dg_kernel, size_arr)...) size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 5)) diff --git a/test/test_script.jl b/test/test_script.jl index 08e14df..38a575b 100644 --- a/test/test_script.jl +++ b/test/test_script.jl @@ -108,85 +108,6 @@ TrixiGPU.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, solver_gpu.volume_integral, solver_gpu, cache_gpu) -# Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), -# equations, solver.volume_integral, solver, cache) -# @test CUDA.@allowscalar du_gpu ≈ du - -# # Test `cuda_prolong2interfaces!` -# TrixiGPU.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -# Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) -# interfaces_u_gpu = replace(cache_gpu.interfaces.u, NaN => 0.0) -# interfaces_u = replace(cache.interfaces.u, NaN => 0.0) -# @test interfaces_u_gpu ≈ interfaces_u - -# # Test `cuda_interface_flux!` -# TrixiGPU.cuda_interface_flux!(mesh_gpu, Trixi.have_nonconservative_terms(equations_gpu), -# equations_gpu, solver_gpu, cache_gpu) -# Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, -# Trixi.have_nonconservative_terms(equations), equations, -# solver.surface_integral, solver, cache) -# surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) -# surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) -# @test surface_flux_values_gpu ≈ surface_flux_values - -# # Test `cuda_prolong2boundaries!` -# TrixiGPU.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, equations_gpu, -# cache_gpu) -# Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) -# boundaries_u_gpu = replace(cache_gpu.boundaries.u, NaN => 0.0) -# boundaries_u = replace(cache.boundaries.u, NaN => 0.0) -# @test boundaries_u_gpu ≈ boundaries_u - -# # Test `cuda_boundary_flux!` -# TrixiGPU.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, -# Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, -# solver_gpu, cache_gpu) -# Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, -# solver.surface_integral, solver) -# surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) -# surface_flux_values = replace(cache.elements.surface_flux_values, 
NaN => 0.0) -# @test surface_flux_values_gpu ≈ surface_flux_values - -# # Test `cuda_prolong2mortars!` -# TrixiGPU.cuda_prolong2mortars!(u_gpu, mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), -# solver_gpu, cache_gpu) -# Trixi.prolong2mortars!(cache, u, mesh, equations, -# solver.mortar, solver.surface_integral, solver) -# u_upper_left_gpu = replace(cache_gpu.mortars.u_upper_left, NaN => 0.0) -# u_upper_right_gpu = replace(cache_gpu.mortars.u_upper_right, NaN => 0.0) -# u_lower_left_gpu = replace(cache_gpu.mortars.u_lower_left, NaN => 0.0) -# u_lower_right_gpu = replace(cache_gpu.mortars.u_lower_right, NaN => 0.0) -# u_upper_left = replace(cache.mortars.u_upper_left, NaN => 0.0) -# u_upper_right = replace(cache.mortars.u_upper_right, NaN => 0.0) -# u_lower_left = replace(cache.mortars.u_lower_left, NaN => 0.0) -# u_lower_right = replace(cache.mortars.u_lower_right, NaN => 0.0) -# @test u_upper_left_gpu ≈ u_upper_left -# @test u_upper_right_gpu ≈ u_upper_right -# @test u_lower_left_gpu ≈ u_lower_left -# @test u_lower_right_gpu ≈ u_lower_right - -# # Test `cuda_mortar_flux!` -# TrixiGPU.cuda_mortar_flux!(mesh_gpu, TrixiGPU.check_cache_mortars(cache_gpu), -# Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, -# solver_gpu, cache_gpu) -# Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, -# Trixi.have_nonconservative_terms(equations), equations, -# solver.mortar, solver.surface_integral, solver, cache) -# surface_flux_values_gpu = replace(cache_gpu.elements.surface_flux_values, NaN => 0.0) -# surface_flux_values = replace(cache.elements.surface_flux_values, NaN => 0.0) -# @test surface_flux_values_gpu ≈ surface_flux_values - -# # Test `cuda_surface_integral!` -# TrixiGPU.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) -# Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) -# @test CUDA.@allowscalar du_gpu ≈ du - -# # Test `cuda_jacobian!` -# TrixiGPU.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) -# Trixi.apply_jacobian!(du, mesh, equations, solver, cache) -# @test CUDA.@allowscalar du_gpu ≈ du - -# # Test `cuda_sources!` -# TrixiGPU.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, equations_gpu, cache_gpu) -# Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) -# @test CUDA.@allowscalar du_gpu ≈ du +Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), + equations, solver.volume_integral, solver, cache) +@test CUDA.@allowscalar du_gpu ≈ du diff --git a/test/test_shallowwater_shockcapturing.jl b/test/test_shallowwater_shockcapturing.jl new file mode 100644 index 0000000..8bb3e0b --- /dev/null +++ b/test/test_shallowwater_shockcapturing.jl @@ -0,0 +1,118 @@ +module TestShallowWaterShock # with `nonconservative_terms::True` + +using Trixi, TrixiGPU +using OrdinaryDiffEq +using Test, CUDA + +# Start testing with a clean environment +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +# Test precision of the semidiscretization process +@testset "Test Shallow Water" begin + @testset "Shallow Water 1D" begin + equations = ShallowWaterEquations1D(gravity_constant = 9.812, H0 = 1.75) + + function initial_condition_stone_throw_discontinuous_bottom(x, t, + equations::ShallowWaterEquations1D) + + # Calculate primitive variables + + # Flat lake + H = equations.H0 + + # Discontinuous velocity + v = 0.0 + if x[1] >= -0.75 && x[1] <= 0.0 + v = -1.0 + elseif x[1] >= 0.0 && x[1] <= 0.75 + v = 1.0 + end + + b = (1.5 / exp(0.5 * 
((x[1] - 1.0)^2)) + + 0.75 / exp(0.5 * ((x[1] + 1.0)^2))) + + # Force a discontinuous bottom topography + if x[1] >= -1.5 && x[1] <= 0.0 + b = 0.5 + end + + return prim2cons(SVector(H, v, b), equations) + end + + initial_condition = initial_condition_stone_throw_discontinuous_bottom + + boundary_condition = boundary_condition_slip_wall + + volume_flux = (flux_wintermeyer_etal, flux_nonconservative_wintermeyer_etal) + surface_flux = (FluxHydrostaticReconstruction(flux_lax_friedrichs, + hydrostatic_reconstruction_audusse_etal), + flux_nonconservative_audusse_etal) + basis = LobattoLegendreBasis(4) + + indicator_sc = IndicatorHennemannGassner(equations, basis, + alpha_max = 0.5, + alpha_min = 0.001, + alpha_smooth = true, + variable = waterheight_pressure) + volume_integral = VolumeIntegralShockCapturingHG(indicator_sc; + volume_flux_dg = volume_flux, + volume_flux_fv = surface_flux) + + solver = DGSEM(basis, surface_flux, volume_integral) + + coordinates_min = -3.0 + coordinates_max = 3.0 + mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 3, + n_cells_max = 10_000, + periodicity = false) + + semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + boundary_conditions = boundary_condition) + + tspan = (0.0, 3.0) + + # Get CPU data + (; mesh, equations, initial_condition, boundary_conditions, source_terms, solver, cache) = semi + + # Get GPU data + equations_gpu = deepcopy(equations) + mesh_gpu, solver_gpu, cache_gpu = deepcopy(mesh), deepcopy(solver), deepcopy(cache) + boundary_conditions_gpu, source_terms_gpu = deepcopy(boundary_conditions), + deepcopy(source_terms) + + # Set initial time + t = t_gpu = 0.0 + + # Get initial data + ode = semidiscretize(semi, tspan) + u_ode = copy(ode.u0) + du_ode = similar(u_ode) + u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) + du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) + + # Copy data to device + du_gpu, u_gpu = TrixiGPU.copy_to_device!(du, u) + # Reset data on host + Trixi.reset_du!(du, solver, cache) + + # Test `cuda_volume_integral!` + TrixiGPU.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) + Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), + equations, solver.volume_integral, solver, cache) + @test CUDA.@allowscalar du_gpu ≈ du + + # Wait for fix of boundary flux dispatches + + # Copy data back to host + du_cpu, u_cpu = TrixiGPU.copy_to_host!(du_gpu, u_gpu) + end + + @testset "Shallow Water 2D" begin end +end + +end # module