From 39853c298e59d328ba7e207092c7d44ccd211095 Mon Sep 17 00:00:00 2001 From: Huiyu Xie Date: Thu, 2 Jan 2025 21:45:31 -1000 Subject: [PATCH] Optimize volume integral kernel for flux differencing (#105) * Start * Update README.md * Fix typo * Flux kernel * Comments * Complete 1D * Complete 2D * Complete 3D --- README.md | 12 +- benchmark/euler_ec_1d.jl | 151 ++++++++++++++++ benchmark/euler_ec_2d.jl | 154 ++++++++++++++++ benchmark/euler_ec_3d.jl | 151 ++++++++++++++++ src/solvers/dg_1d.jl | 184 +++++++++++++------ src/solvers/dg_2d.jl | 231 ++++++++++++++++-------- src/solvers/dg_3d.jl | 263 ++++++++++++++++++---------- test/tree_dgsem_2d/euler_ec.jl | 130 ++++++++++++++ test/tree_dgsem_2d/tree_dgsem_2d.jl | 1 + 9 files changed, 1050 insertions(+), 227 deletions(-) create mode 100644 benchmark/euler_ec_1d.jl create mode 100644 benchmark/euler_ec_2d.jl create mode 100644 benchmark/euler_ec_3d.jl create mode 100644 test/tree_dgsem_2d/euler_ec.jl diff --git a/README.md b/README.md index f0e97c3d..85c533a2 100644 --- a/README.md +++ b/README.md @@ -8,19 +8,17 @@ TrixiCUDA.jl offers CUDA acceleration for solving hyperbolic PDEs. ⚠️ **Warning:** Our package may not always be updated with the latest updates or improvements in Trixi.jl. Forcing an update of Trixi.jl as a dependency for TrixiCUDA.jl beyond the version bounds specified in `Project.toml` may cause unexpected errors. -*Update on Nov 21, 2024*: +*Update on Dec 31, 2024*: +- The initial round of kernel optimization starts with the volume integral kernels (see [TrixiCUDA.jl PR #102](https://github.com/trixi-gpu/TrixiCUDA.jl/pull/102)) and will later extend to all existing kernels used in the semidiscretization process. The primary approach includes improving global memory access patterns, using shared memory, and selecting appropriate launch sizes to reduce warp stalls and achieve better occupancy. 
+ +*Update on Nov 21, 2024*: - Due to the [incompatibility issue](https://github.com/trixi-framework/Trixi.jl/issues/2108) from upstream with Trixi.jl and CUDA.jl in Julia v1.11, this package now supports only Julia v1.10. Using or developing this package with Julia v1.11 will result in precompilation errors. To fix this, downgrade to Julia v1.10. If you have any other problems, please file issues [here](https://github.com/trixi-gpu/TrixiCUDA.jl/issues). *Update on Oct 30, 2024*: - The general documentation is now available at https://trixi-gpu.github.io (in development). - Documentation specific to this package can be found at https://trixi-gpu.github.io/TrixiCUDA.jl/dev (in development). -*Update on Oct 11, 2024*: - -- Development on Julia v1.11 is currently on hold due to an issue between the latest version of CUDA.jl and Trixi.jl. The fix is in progress. - - Trace the root issue in [Trixi.jl Issue #1789](https://github.com/trixi-framework/Trixi.jl/issues/1789): SciMLBase.jl has dropped support for arrays of `SVector`. - - [Trixi.jl PR #2150](https://github.com/trixi-framework/Trixi.jl/pull/2150) is opened to replace arrays of `SVector` using RecursiveArrayTools.jl. - - It requires updating RecursiveArrayTools.jl, which is compatible with Julia >= v1.10. However, Trixi.jl has some legacy tests relying on Julia v1.8 and v1.9. See more discussions in [Trixi.jl PR #2194](https://github.com/trixi-framework/Trixi.jl/pull/2194). 
+[Archived Update](https://trixi-gpu.github.io/update/) # Package Installation diff --git a/benchmark/euler_ec_1d.jl b/benchmark/euler_ec_1d.jl new file mode 100644 index 00000000..faf2b817 --- /dev/null +++ b/benchmark/euler_ec_1d.jl @@ -0,0 +1,151 @@ +using Trixi, TrixiCUDA +using CUDA +using BenchmarkTools + +# Set the precision +RealT = Float32 + +# Set up the problem +equations = CompressibleEulerEquations1D(1.4f0) + +initial_condition = initial_condition_weak_blast_wave + +volume_flux = flux_ranocha +solver = DGSEM(polydeg = 3, surface_flux = flux_ranocha, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) + +coordinates_min = (-2.0f0,) +coordinates_max = (2.0f0,) +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 5, + n_cells_max = 10_000, RealT = RealT) + +# Cache initialization +@info "Time for cache initialization on CPU" +@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) +@info "Time for cache initialization on GPU" +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) + +tspan = tspan_gpu = (0.0f0, 0.4f0) +t = t_gpu = 0.0f0 + +# Semi on CPU +(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi + +# Semi on GPU +equations_gpu = semi_gpu.equations +mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache +boundary_conditions_gpu = semi_gpu.boundary_conditions +source_terms_gpu = semi_gpu.source_terms + +# ODE on CPU +ode = semidiscretize(semi, tspan) +u_ode = copy(ode.u0) +du_ode = similar(u_ode) +u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) +du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) + +# ODE on GPU +ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu) +u_gpu = copy(ode_gpu.u0) +du_gpu = similar(u_gpu) + +# # Reset du +# @info "Time for reset_du! on GPU" +# CUDA.@time TrixiCUDA.reset_du!(du_gpu) +# @info "Time for reset_du! 
on CPU" +# @time Trixi.reset_du!(du, solver, cache) + +# Reset du and volume integral +@info "Time for reset_du! and volume_integral! on GPU" +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) +# @info "Time for reset_du! and volume_integral! on CPU" +# @time begin +# Trixi.reset_du!(du, solver, cache) +# Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), +# equations, solver.volume_integral, solver, cache) +# end + +# # Prolong to interfaces +# @info "Time for prolong2interfaces! on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) +# @info "Time for prolong2interfaces! on CPU" +# @time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) + +# # Interface flux +# @info "Time for interface_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu, +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for interface_flux! on CPU" +# @time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, +# Trixi.have_nonconservative_terms(equations), equations, +# solver.surface_integral, solver, cache) + +# # Prolong to boundaries +# @info "Time for prolong2boundaries! on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, +# equations_gpu, cache_gpu) +# @info "Time for prolong2boundaries! on CPU" +# @time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) + +# # Boundary flux +# @info "Time for boundary_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for boundary_flux! 
on CPU" +# @time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, +# solver.surface_integral, solver) + +# # Prolong to mortars +# @info "Time for prolong2mortars! on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, +# TrixiCUDA.check_cache_mortars(cache_gpu), +# solver_gpu, cache_gpu) +# @info "Time for prolong2mortars! on CPU" +# @time Trixi.prolong2mortars!(cache, u, mesh, equations, +# solver.mortar, solver.surface_integral, solver) + +# # Mortar flux +# @info "Time for mortar_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for mortar_flux! on CPU" +# @time Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, +# Trixi.have_nonconservative_terms(equations), equations, +# solver.mortar, solver.surface_integral, solver, cache) + +# # Surface integral +# @info "Time for surface_integral! on GPU" +# CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) +# @info "Time for surface_integral! on CPU" +# @time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, +# solver, cache) + +# # Jacobian +# @info "Time for jacobian! on GPU" +# CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) +# @info "Time for jacobian! on CPU" +# @time Trixi.apply_jacobian!(du, mesh, equations, solver, cache) + +# # Sources terms +# @info "Time for sources! on GPU" +# CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, +# equations_gpu, cache_gpu) +# @info "Time for sources! on CPU" +# @time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) + +# # Semidiscretization process +# @info "Time for rhs! 
on GPU" +# CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu, +# boundary_conditions_gpu, source_terms_gpu, +# solver_gpu, cache_gpu) +# @info "Time for rhs! on CPU" +# @time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, +# solver, cache) diff --git a/benchmark/euler_ec_2d.jl b/benchmark/euler_ec_2d.jl new file mode 100644 index 00000000..3655fc77 --- /dev/null +++ b/benchmark/euler_ec_2d.jl @@ -0,0 +1,154 @@ +using Trixi, TrixiCUDA +using CUDA +using BenchmarkTools + +# Set the precision +RealT = Float32 + +# Set up the problem +equations = CompressibleEulerEquations2D(1.4f0) + +initial_condition = initial_condition_weak_blast_wave + +volume_flux = flux_ranocha +solver = DGSEM(polydeg = 3, surface_flux = flux_ranocha, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) + +coordinates_min = (-2.0f0, -2.0f0) +coordinates_max = (2.0f0, 2.0f0) +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 5, + n_cells_max = 10_000, + periodicity = true, RealT = RealT) + +# Cache initialization +@info "Time for cache initialization on CPU" +@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + boundary_conditions = boundary_condition_periodic) +@info "Time for cache initialization on GPU" +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver, + boundary_conditions = boundary_condition_periodic) + +tspan = tspan_gpu = (0.0f0, 0.4f0) +t = t_gpu = 0.0f0 + +# Semi on CPU +(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi + +# Semi on GPU +equations_gpu = semi_gpu.equations +mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache +boundary_conditions_gpu = semi_gpu.boundary_conditions +source_terms_gpu = semi_gpu.source_terms + +# ODE on CPU +ode = semidiscretize(semi, tspan) +u_ode = copy(ode.u0) +du_ode = similar(u_ode) +u = 
Trixi.wrap_array(u_ode, mesh, equations, solver, cache) +du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) + +# ODE on GPU +ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu) +u_gpu = copy(ode_gpu.u0) +du_gpu = similar(u_gpu) + +# # Reset du +# @info "Time for reset_du! on GPU" +# CUDA.@time TrixiCUDA.reset_du!(du_gpu) +# @info "Time for reset_du! on CPU" +# @time Trixi.reset_du!(du, solver, cache) + +# Reset du and volume integral +@info "Time for reset_du! and volume_integral! on GPU" +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) +# @info "Time for reset_du! and volume_integral! on CPU" +# @time begin +# Trixi.reset_du!(du, solver, cache) +# Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), +# equations, solver.volume_integral, solver, cache) +# end + +# # Prolong to interfaces +# @info "Time for prolong2interfaces! on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) +# @info "Time for prolong2interfaces! on CPU" +# @time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) + +# # Interface flux +# @info "Time for interface_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu, +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for interface_flux! on CPU" +# @time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, +# Trixi.have_nonconservative_terms(equations), equations, +# solver.surface_integral, solver, cache) + +# # Prolong to boundaries +# @info "Time for prolong2boundaries! on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, +# equations_gpu, cache_gpu) +# @info "Time for prolong2boundaries! 
on CPU" +# @time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) + +# # Boundary flux +# @info "Time for boundary_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for boundary_flux! on CPU" +# @time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, +# solver.surface_integral, solver) + +# # Prolong to mortars +# @info "Time for prolong2mortars! on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, +# TrixiCUDA.check_cache_mortars(cache_gpu), +# solver_gpu, cache_gpu) +# @info "Time for prolong2mortars! on CPU" +# @time Trixi.prolong2mortars!(cache, u, mesh, equations, +# solver.mortar, solver.surface_integral, solver) + +# # Mortar flux +# @info "Time for mortar_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for mortar_flux! on CPU" +# @time Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, +# Trixi.have_nonconservative_terms(equations), equations, +# solver.mortar, solver.surface_integral, solver, cache) + +# # Surface integral +# @info "Time for surface_integral! on GPU" +# CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) +# @info "Time for surface_integral! on CPU" +# @time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, +# solver, cache) + +# # Jacobian +# @info "Time for jacobian! on GPU" +# CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) +# @info "Time for jacobian! on CPU" +# @time Trixi.apply_jacobian!(du, mesh, equations, solver, cache) + +# # Sources terms +# @info "Time for sources! 
on GPU" +# CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, +# equations_gpu, cache_gpu) +# @info "Time for sources! on CPU" +# @time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) + +# # Semidiscretization process +# @info "Time for rhs! on GPU" +# CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu, +# boundary_conditions_gpu, source_terms_gpu, +# solver_gpu, cache_gpu) +# @info "Time for rhs! on CPU" +# @time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, +# solver, cache) diff --git a/benchmark/euler_ec_3d.jl b/benchmark/euler_ec_3d.jl new file mode 100644 index 00000000..83e08262 --- /dev/null +++ b/benchmark/euler_ec_3d.jl @@ -0,0 +1,151 @@ +using Trixi, TrixiCUDA +using CUDA +using BenchmarkTools + +# Set the precision +RealT = Float32 + +# Set up the problem +equations = CompressibleEulerEquations3D(1.4f0) + +initial_condition = initial_condition_weak_blast_wave + +volume_flux = flux_ranocha +solver = DGSEM(polydeg = 2, surface_flux = flux_ranocha, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux), + RealT = RealT) + +coordinates_min = (-2.0f0, -2.0f0, -2.0f0) +coordinates_max = (2.0f0, 2.0f0, 2.0f0) +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 3, + n_cells_max = 100_000, RealT = RealT) + +# Cache initialization +@info "Time for cache initialization on CPU" +@time semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) +@info "Time for cache initialization on GPU" +CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) + +tspan = tspan_gpu = (0.0f0, 0.4f0) +t = t_gpu = 0.0f0 + +# Semi on CPU +(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi + +# Semi on GPU +equations_gpu = semi_gpu.equations +mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache +boundary_conditions_gpu = 
semi_gpu.boundary_conditions +source_terms_gpu = semi_gpu.source_terms + +# ODE on CPU +ode = semidiscretize(semi, tspan) +u_ode = copy(ode.u0) +du_ode = similar(u_ode) +u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) +du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) + +# ODE on GPU +ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu) +u_gpu = copy(ode_gpu.u0) +du_gpu = similar(u_gpu) + +# # Reset du +# @info "Time for reset_du! on GPU" +# CUDA.@time TrixiCUDA.reset_du!(du_gpu) +# @info "Time for reset_du! on CPU" +# @time Trixi.reset_du!(du, solver, cache) + +# Reset du and volume integral +@info "Time for reset_du! and volume_integral! on GPU" +@benchmark CUDA.@sync TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) +# @info "Time for reset_du! and volume_integral! on CPU" +# @time begin +# Trixi.reset_du!(du, solver, cache) +# Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), +# equations, solver.volume_integral, solver, cache) +# end + +# # Prolong to interfaces +# @info "Time for prolong2interfaces! on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) +# @info "Time for prolong2interfaces! on CPU" +# @time Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) + +# # Interface flux +# @info "Time for interface_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_interface_flux!(mesh_gpu, +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for interface_flux! on CPU" +# @time Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, +# Trixi.have_nonconservative_terms(equations), equations, +# solver.surface_integral, solver, cache) + +# # Prolong to boundaries +# @info "Time for prolong2boundaries! 
on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, +# equations_gpu, cache_gpu) +# @info "Time for prolong2boundaries! on CPU" +# @time Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) + +# # Boundary flux +# @info "Time for boundary_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for boundary_flux! on CPU" +# @time Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, +# solver.surface_integral, solver) + +# # Prolong to mortars +# @info "Time for prolong2mortars! on GPU" +# CUDA.@time TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, +# TrixiCUDA.check_cache_mortars(cache_gpu), +# solver_gpu, cache_gpu) +# @info "Time for prolong2mortars! on CPU" +# @time Trixi.prolong2mortars!(cache, u, mesh, equations, +# solver.mortar, solver.surface_integral, solver) + +# # Mortar flux +# @info "Time for mortar_flux! on GPU" +# CUDA.@time TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) +# @info "Time for mortar_flux! on CPU" +# @time Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, +# Trixi.have_nonconservative_terms(equations), equations, +# solver.mortar, solver.surface_integral, solver, cache) + +# # Surface integral +# @info "Time for surface_integral! on GPU" +# CUDA.@time TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) +# @info "Time for surface_integral! on CPU" +# @time Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, +# solver, cache) + +# # Jacobian +# @info "Time for jacobian! on GPU" +# CUDA.@time TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) +# @info "Time for jacobian! 
on CPU" +# @time Trixi.apply_jacobian!(du, mesh, equations, solver, cache) + +# # Sources terms +# @info "Time for sources! on GPU" +# CUDA.@time TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, +# equations_gpu, cache_gpu) +# @info "Time for sources! on CPU" +# @time Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) + +# # Semidiscretization process +# @info "Time for rhs! on GPU" +# CUDA.@time TrixiCUDA.rhs_gpu!(du_gpu, u_gpu, t_gpu, mesh_gpu, equations_gpu, +# boundary_conditions_gpu, source_terms_gpu, +# solver_gpu, cache_gpu) +# @info "Time for rhs! on CPU" +# @time Trixi.rhs!(du, u, t, mesh, equations, boundary_conditions, source_terms, +# solver, cache) diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl index 66fd2dde..7c32e8fe 100644 --- a/src/solvers/dg_1d.jl +++ b/src/solvers/dg_1d.jl @@ -6,15 +6,16 @@ # Kernel for calculating fluxes along normal direction function flux_kernel!(flux_arr, u, equations::AbstractEquations{1}, flux::Any) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - if (i <= size(u, 1) && j <= size(u, 2) && k <= size(u, 3)) + if (j <= size(u, 2) && k <= size(u, 3)) u_node = get_node_vars(u, equations, j, k) flux_node = flux(u_node, 1, equations) - @inbounds flux_arr[i, j, k] = flux_node[i] + for ii in axes(u, 1) + @inbounds flux_arr[ii, j, k] = flux_node[ii] + end end return nothing @@ -36,8 +37,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr) return nothing end -# Kernel for calculating flux and weak form -# It is optimized using shared memory access and computation tiling +# Kernel for calculating fluxes and weak form +# An optimized version of the fusion of `flux_kernel!` and `weak_form_kernel!` function flux_weak_form_kernel!(du, u, derivative_dhat, 
equations::AbstractEquations{1}, flux::Any) # Set tile width @@ -50,11 +51,7 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, shmem_flux = @cuDynamicSharedMem(eltype(du), (size(du, 1), tile_width), offset) - # Get the thread and block indices - # bx, by, bz = blockIdx().x, blockIdx().y, blockIdx().z - # tx, ty, tz = threadIdx().x, threadIdx().y, threadIdx().z - - # Get thread and block indices we need to save registers + # Get thread and block indices only we need to save registers tx, ty = threadIdx().x, threadIdx().y # We construct more threads than we need to allocate shared memory @@ -64,16 +61,14 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, ty2 = rem(ty - 1, tile_width) + 1 # We launch one block in x direction so i = tx - # i = (bx - 1) * blockDim().x + tx + # i = (blockIdx().x - 1) * blockDim().x + threadIdx().x k = (blockIdx().z - 1) * blockDim().z + threadIdx().z # Tile the computation (restrict to one tile here) value = zero(eltype(du)) # Load global `derivative_dhat` into shared memory - # Note the memory access pattern matters here, transposed or not needs to be - # considered for better performance - # TODO: Better memory access pattern + # Transposed memory access or not? 
@inbounds begin shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2] end @@ -82,10 +77,7 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, # Note that `flux_arr` is removed for smaller GPU memory allocation u_node = get_node_vars(u, equations, ty1, k) flux_node = flux(u_node, 1, equations) - # @inbounds begin - # shmem_flux[tx, ty1] = flux_arr[tx, ty1, k] - # end - # TODO: Better memory access pattern + @inbounds begin shmem_flux[tx, ty1] = flux_node[tx] end @@ -93,6 +85,9 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, sync_threads() # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts and parallelize ty1 with threads to ty2, + # then consolidate each computation back to ty1 + # How to replace shared memory `shmem_flux` with `flux_node`? for thread in 1:tile_width @inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread] end @@ -129,6 +124,91 @@ function volume_flux_kernel!(volume_flux_arr, u, equations::AbstractEquations{1} return nothing end +# Kernel for calculating volume integrals +function volume_integral_kernel!(du, derivative_split, volume_flux_arr, + equations::AbstractEquations{1}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + @inbounds du[i, j, k] = zero(eltype(du)) # fuse `reset_du!` here + for ii in axes(du, 2) + @inbounds du[i, j, k] += derivative_split[j, ii] * volume_flux_arr[i, j, ii, k] + end + end + + return nothing +end + +# Kernel for calculating volume fluxes and volume integrals +# An optimized version of the fusion of `volume_flux_kernel!` and `volume_integral_kernel!` +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{1}, volume_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic 
shared memory + shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_vflux = @cuDynamicSharedMem(eltype(du), + (size(du, 1), tile_width, tile_width), offset) + + # Get thread and block indices only we need to save registers + tx, ty = threadIdx().x, threadIdx().y + + # We launch one block in y direction so j = ty + ty1 = div(ty - 1, tile_width) + 1 # same as j1 + ty2 = rem(ty - 1, tile_width) + 1 # same as j2 + + # We launch one block in x direction so i = tx + # i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + # j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # j1 = div(j - 1, tile_width) + 1 + # j2 = rem(j - 1, tile_width) + 1 + + # Tile the computation (restrict to one tile here) + value = zero(eltype(du)) + + # Load global `derivative_split` into shared memory + # Transposed memory access or not? + @inbounds begin + shmem_split[ty2, ty1] = derivative_split[ty1, ty2] + end + + # Load global `volume_flux_arr` into shared memory + # Note that `volume_flux_arr` is removed for smaller GPU memory allocation + u_node = get_node_vars(u, equations, ty1, k) + u_node1 = get_node_vars(u, equations, ty2, k) + + volume_flux_node = volume_flux(u_node, u_node1, 1, equations) + + @inbounds begin + shmem_vflux[tx, ty1, ty2] = volume_flux_node[tx] + end + + sync_threads() + + # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts and parallelize ty1 with threads to ty2, + # then consolidate each computation back to ty1 + # How to replace shared memory `shmem_vflux` with `volume_flux_node`? 
+ for thread in 1:tile_width + @inbounds value += shmem_split[thread, ty1] * shmem_vflux[tx, ty1, thread] + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the weak form + @inbounds du[tx, ty1, k] = value + + return nothing +end + # Kernel for calculating symmetric and nonconservative fluxes function symmetric_noncons_flux_kernel!(symmetric_flux_arr, noncons_flux_arr, u, derivative_split, equations::AbstractEquations{1}, symmetric_flux::Any, @@ -158,23 +238,6 @@ function symmetric_noncons_flux_kernel!(symmetric_flux_arr, noncons_flux_arr, u, return nothing end -# Kernel for calculating volume integrals -function volume_integral_kernel!(du, derivative_split, volume_flux_arr, - equations::AbstractEquations{1}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) - @inbounds du[i, j, k] = zero(eltype(du)) # fuse `reset_du!` here - for ii in axes(du, 2) - @inbounds du[i, j, k] += derivative_split[j, ii] * volume_flux_arr[i, j, ii, k] - end - end - - return nothing -end - # Kernel for calculating symmetric and nonconservative volume integrals function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, noncons_flux_arr) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x @@ -677,22 +740,21 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms, # The maximum number of threads per block is the dominant factor when choosing the optimization # method. However, there are other factors that may cause a launch failure, such as the maximum - # number of registers per block. Here, we have omitted all other factors, but this should be - # enhanced later for a safer kernel launch. + # shared memory per block. 
Here, we have omitted all other factors, but this should be enhanced + # later for a safer kernel launch. # TODO: More checks before the kernel launch - thread_num_per_block = size(du, 1) * size(du, 2)^2 - if thread_num_per_block > MAX_THREADS_PER_BLOCK + thread_per_block = size(du, 1) * size(du, 2)^2 + if thread_per_block > MAX_THREADS_PER_BLOCK # TODO: How to optimize when size is large flux_arr = similar(u) flux_kernel = @cuda launch=false flux_kernel!(flux_arr, u, equations, flux) flux_kernel(flux_arr, u, equations, flux; - kernel_configurator_3d(flux_kernel, size(u)...)...) + kernel_configurator_2d(flux_kernel, size(u, 2), size(u, 3))...) weak_form_kernel = @cuda launch=false weak_form_kernel!(du, derivative_dhat, flux_arr) weak_form_kernel(du, derivative_dhat, flux_arr; kernel_configurator_3d(weak_form_kernel, size(du)...)...) - else shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)) * sizeof(eltype(du)) flux_weak_form_kernel = @cuda launch=false flux_weak_form_kernel!(du, u, derivative_dhat, @@ -715,20 +777,34 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: volume_flux = volume_integral.volume_flux derivative_split = dg.basis.derivative_split - set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` - + # TODO: Move `set_diagonal_to_zero!` outside of `rhs!` loop and cache the result in + # DG struct on GPU + set_diagonal_to_zero!(derivative_split) derivative_split = CuArray(derivative_split) - volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3)) - volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr, u, equations, - volume_flux) - volume_flux_kernel(volume_flux_arr, u, equations, volume_flux; - kernel_configurator_2d(volume_flux_kernel, size(u, 2)^2, size(u, 3))...) 
+ thread_per_block = size(du, 1) * size(du, 2)^2 + if thread_per_block > MAX_THREADS_PER_BLOCK + # TODO: How to optimize when size is large + volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3)) - volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, - volume_flux_arr, equations) - volume_integral_kernel(du, derivative_split, volume_flux_arr, equations; - kernel_configurator_3d(volume_integral_kernel, size(du)...)...) + volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr, u, equations, + volume_flux) + volume_flux_kernel(volume_flux_arr, u, equations, volume_flux; + kernel_configurator_2d(volume_flux_kernel, size(u, 2)^2, size(u, 3))...) + + volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, + volume_flux_arr, equations) + volume_integral_kernel(du, derivative_split, volume_flux_arr, equations; + kernel_configurator_3d(volume_integral_kernel, size(du)...)...) 
+ else + shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)^2) * sizeof(eltype(du)) + volume_flux_integral_kernel = @cuda launch=false volume_flux_integral_kernel!(du, u, derivative_split, + equations, volume_flux) + volume_flux_integral_kernel(du, u, derivative_split, equations, volume_flux; + shmem = shmem_size, + threads = (size(du, 1), size(du, 2)^2, 1), + blocks = (1, 1, size(du, 3))) + end return nothing end diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index 4b449422..b65cea68 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -6,11 +6,10 @@ # Kernel for calculating fluxes along normal directions function flux_kernel!(flux_arr1, flux_arr2, u, equations::AbstractEquations{2}, flux::Any) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - if (i <= size(u, 1) && j <= size(u, 2)^2 && k <= size(u, 4)) + if (j <= size(u, 2)^2 && k <= size(u, 4)) j1 = div(j - 1, size(u, 2)) + 1 j2 = rem(j - 1, size(u, 2)) + 1 @@ -19,9 +18,11 @@ function flux_kernel!(flux_arr1, flux_arr2, u, equations::AbstractEquations{2}, flux_node1 = flux(u_node, 1, equations) flux_node2 = flux(u_node, 2, equations) - @inbounds begin - flux_arr1[i, j1, j2, k] = flux_node1[i] - flux_arr2[i, j1, j2, k] = flux_node2[i] + for ii in axes(u, 1) + @inbounds begin + flux_arr1[ii, j1, j2, k] = flux_node1[ii] + flux_arr2[ii, j1, j2, k] = flux_node2[ii] + end end end @@ -51,8 +52,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2) return nothing end -# Kernel for calculating flux and weak form -# It is optimized using shared memory access and computation tiling +# Kernel for calculating fluxes and weak form +# An optimized version of the fusion of `flux_kernel!` and `weak_form_kernel!` function flux_weak_form_kernel!(du, u, 
derivative_dhat, equations::AbstractEquations{2}, flux::Any) # Set tile width @@ -63,13 +64,10 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, shmem_dhat = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width)) offset += sizeof(eltype(du)) * tile_width^2 shmem_flux = @cuDynamicSharedMem(eltype(du), - (size(du, 1), tile_width, tile_width, 2), offset) - - # Get thread and block indices - # bx, by, bz = blockIdx().x, blockIdx().y, blockIdx().z - # tx, ty, tz = threadIdx().x, threadIdx().y, threadIdx().z + (size(du, 1), tile_width, tile_width, 2), + offset) - # Get thread and block indices we need to save registers + # Get thread and block indices only we need to save registers tx, ty = threadIdx().x, threadIdx().y # We launch one block in y direction so j = ty @@ -88,25 +86,17 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, value = zero(eltype(du)) # Load global `derivative_dhat` into shared memory - # Note the memory access pattern matters here, transposed or not needs to be - # considered for better performance - # TODO: Better memory access pattern + # Transposed memory access or not? 
@inbounds begin shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2] end - # Load global `flux_arr1` into shared memory - # Load global `flux_arr2` into shared memory - # Note that `flux_arr1` and `flux_arr2` are removed for smaller - # GPU memory allocation + # Load global `flux_arr1` and `flux_arr2` into shared memory, and note + # that they are removed now for smaller GPU memory allocation u_node = get_node_vars(u, equations, ty1, ty2, k) flux_node1 = flux(u_node, 1, equations) flux_node2 = flux(u_node, 2, equations) - # @inbounds begin - # shmem_flux[tx, ty1, ty2, 1] = flux_arr1[tx, ty1, ty2, k] - # shmem_flux[tx, ty1, ty2, 2] = flux_arr2[tx, ty1, ty2, k] - # end - # TODO: Better memory access pattern + @inbounds begin shmem_flux[tx, ty1, ty2, 1] = flux_node1[tx] shmem_flux[tx, ty1, ty2, 2] = flux_node2[tx] @@ -115,6 +105,8 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, sync_threads() # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts + # How to replace shared memory `shmem_flux` with `flux_node`? 
for thread in 1:tile_width @inbounds begin value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1] @@ -162,6 +154,107 @@ function volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, u, equations::A return nothing end +# Kernel for calculating volume integrals +function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, + equations::AbstractEquations{2}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) + j1 = div(j - 1, size(du, 2)) + 1 + j2 = rem(j - 1, size(du, 2)) + 1 + + @inbounds du[i, j1, j2, k] = zero(eltype(du)) # fuse `reset_du!` here + + for ii in axes(du, 2) + @inbounds begin + du[i, j1, j2, k] += derivative_split[j1, ii] * volume_flux_arr1[i, j1, ii, j2, k] + du[i, j1, j2, k] += derivative_split[j2, ii] * volume_flux_arr2[i, j1, j2, ii, k] + end + end + end + + return nothing +end + +# Kernel for calculating volume fluxes and volume integrals +# An optimized version of the fusion of `volume_flux_kernel!` and `volume_integral_kernel!` +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{2}, volume_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_vflux = @cuDynamicSharedMem(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width, 2), + offset) + + # Get thread and block indices only we need save registers + tx, ty = threadIdx().x, threadIdx().y + + # We launch one block in y direction so j = ty + ty1 = div(ty - 1, tile_width^2) + 1 # same as j1 + ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 # same as j2 + ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 
# same as j3 + + # We launch one block in x direction so i = tx + # i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + # j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # j1 = div(j - 1, u2^2) + 1 + # j2 = div(rem(j - 1, u2^2), u2) + 1 + # j3 = rem(rem(j - 1, u2^2), u2) + 1 + + # Tile the computation (restrict to one tile here) + value = zero(eltype(du)) + + # Load global `derivative_split` into shared memory + # Transposed memory access or not? + @inbounds begin + shmem_split[ty2, ty1] = derivative_split[ty1, ty2] + end + + # Load global `volume_flux_arr1` and `volume_flux_arr2` into shared memory, + # and note that they are removed now for smaller GPU memory allocation + u_node = get_node_vars(u, equations, ty1, ty2, k) + u_node1 = get_node_vars(u, equations, ty3, ty2, k) + u_node2 = get_node_vars(u, equations, ty1, ty3, k) + + volume_flux_node1 = volume_flux(u_node, u_node1, 1, equations) + volume_flux_node2 = volume_flux(u_node, u_node2, 2, equations) + + @inbounds begin + shmem_vflux[tx, ty1, ty3, ty2, 1] = volume_flux_node1[tx] + shmem_vflux[tx, ty1, ty2, ty3, 2] = volume_flux_node2[tx] + end + + sync_threads() + + # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts and parallelize (ty1, ty2) with threads to ty3, + # then consolidate each computation back to (ty1, ty2) + # How to replace shared memory `shmem_flux` with `flux_node`? 
+ for thread in 1:tile_width + @inbounds begin + value += shmem_split[thread, ty1] * shmem_vflux[tx, ty1, thread, ty2, 1] + value += shmem_split[thread, ty2] * shmem_vflux[tx, ty1, ty2, thread, 2] + end + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the weak form + @inbounds du[tx, ty1, ty2, k] = value + + return nothing +end + # Kernel for calculating symmetric and nonconservative fluxes function symmetric_noncons_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, noncons_flux_arr1, noncons_flux_arr2, u, derivative_split, @@ -202,30 +295,6 @@ function symmetric_noncons_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2 return nothing end -# Kernel for calculating volume integrals -function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, - equations::AbstractEquations{2}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) - j1 = div(j - 1, size(du, 2)) + 1 - j2 = rem(j - 1, size(du, 2)) + 1 - - @inbounds du[i, j1, j2, k] = zero(eltype(du)) # fuse `reset_du!` here - - for ii in axes(du, 2) - @inbounds begin - du[i, j1, j2, k] += derivative_split[j1, ii] * volume_flux_arr1[i, j1, ii, j2, k] - du[i, j1, j2, k] += derivative_split[j2, ii] * volume_flux_arr2[i, j1, j2, ii, k] - end - end - end - - return nothing -end - # Kernel for calculating symmetric and nonconservative volume integrals function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, symmetric_flux_arr2, noncons_flux_arr1, noncons_flux_arr2) @@ -1129,19 +1198,18 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms, # The maximum number of threads per block is the dominant factor when choosing the optimization # method. 
However, there are other factors that may cause a launch failure, such as the maximum - # number of registers per block. Here, we have omitted all other factors, but this should be - # enhanced later for a safer kernel launch. + # shared memory per block. Here, we have omitted all other factors, but this should be enhanced + # later for a safer kernel launch. # TODO: More checks before the kernel launch - thread_num_per_block = size(du, 1) * size(du, 2)^2 - if thread_num_per_block > MAX_THREADS_PER_BLOCK + thread_per_block = size(du, 1) * size(du, 2)^2 + if thread_per_block > MAX_THREADS_PER_BLOCK # TODO: How to optimize when size is large flux_arr1 = similar(u) flux_arr2 = similar(u) flux_kernel = @cuda launch=false flux_kernel!(flux_arr1, flux_arr2, u, equations, flux) flux_kernel(flux_arr1, flux_arr2, u, equations, flux; - kernel_configurator_3d(flux_kernel, size(u, 1), size(u, 2)^2, - size(u, 4))...) + kernel_configurator_2d(flux_kernel, size(u, 2)^2, size(u, 4))...) weak_form_kernel = @cuda launch=false weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2) @@ -1168,27 +1236,40 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: RealT = eltype(du) volume_flux = volume_integral.volume_flux - derivative_split = dg.basis.derivative_split - set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` - + # TODO: Move `set_diagonal_to_zero!` outside of `rhs!` loop and cache the result in + # DG struct on GPU + set_diagonal_to_zero!(derivative_split) derivative_split = CuArray(derivative_split) - volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), - size(u, 4)) - volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), - size(u, 4)) - volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, - u, equations, volume_flux) - volume_flux_kernel(volume_flux_arr1, volume_flux_arr2, u, equations, 
volume_flux; - kernel_configurator_2d(volume_flux_kernel, size(u, 2)^3, size(u, 4))...) - - volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, - volume_flux_arr1, - volume_flux_arr2, equations) - volume_integral_kernel(du, derivative_split, volume_flux_arr1, volume_flux_arr2, equations; - kernel_configurator_3d(volume_integral_kernel, size(du, 1), - size(du, 2)^2, size(du, 4))...) + thread_per_block = size(du, 1) * size(du, 2)^3 + if thread_per_block > MAX_THREADS_PER_BLOCK + # TODO: How to optimize when size is large + volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), + size(u, 4)) + volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), + size(u, 4)) + + volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, + u, equations, volume_flux) + volume_flux_kernel(volume_flux_arr1, volume_flux_arr2, u, equations, volume_flux; + kernel_configurator_2d(volume_flux_kernel, size(u, 2)^3, size(u, 4))...) + + volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, + volume_flux_arr1, + volume_flux_arr2, equations) + volume_integral_kernel(du, derivative_split, volume_flux_arr1, volume_flux_arr2, equations; + kernel_configurator_3d(volume_integral_kernel, size(du, 1), + size(du, 2)^2, size(du, 4))...) 
+ else + shmem_size = (size(du, 2)^2 + size(du, 1) * 2 * size(du, 2)^3) * sizeof(eltype(du)) + volume_flux_integral_kernel = @cuda launch=false volume_flux_integral_kernel!(du, u, derivative_split, + equations, volume_flux) + volume_flux_integral_kernel(du, u, derivative_split, equations, volume_flux; + shmem = shmem_size, + threads = (size(du, 1), size(du, 2)^3, 1), + blocks = (1, 1, size(du, 4))) + end return nothing end diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 5d4c2580..04e501a2 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -7,11 +7,10 @@ # Kernel for calculating fluxes along normal directions function flux_kernel!(flux_arr1, flux_arr2, flux_arr3, u, equations::AbstractEquations{3}, flux::Any) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - if (i <= size(u, 1) && j <= size(u, 2)^3 && k <= size(u, 5)) + if (j <= size(u, 2)^3 && k <= size(u, 5)) u2 = size(u, 2) j1 = div(j - 1, u2^2) + 1 @@ -24,10 +23,12 @@ function flux_kernel!(flux_arr1, flux_arr2, flux_arr3, u, equations::AbstractEqu flux_node2 = flux(u_node, 2, equations) flux_node3 = flux(u_node, 3, equations) - @inbounds begin - flux_arr1[i, j1, j2, j3, k] = flux_node1[i] - flux_arr2[i, j1, j2, j3, k] = flux_node2[i] - flux_arr3[i, j1, j2, j3, k] = flux_node3[i] + for ii in axes(u, 1) + @inbounds begin + flux_arr1[ii, j1, j2, j3, k] = flux_node1[ii] + flux_arr2[ii, j1, j2, j3, k] = flux_node2[ii] + flux_arr3[ii, j1, j2, j3, k] = flux_node3[ii] + end end end @@ -61,8 +62,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3) return nothing end -# Kernel for calculating flux and weak form -# It is optimized using shared memory access and computation tiling +# Kernel for calculating fluxes and weak form +# An 
optimized version of the fusion of `flux_kernel!` and `weak_form_kernel!` function flux_weak_form_kernel!(du, u, derivative_dhat, equations::AbstractEquations{3}, flux::Any) # Set tile width @@ -73,13 +74,10 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, shmem_dhat = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width)) offset += sizeof(eltype(du)) * tile_width^2 shmem_flux = @cuDynamicSharedMem(eltype(du), - (size(du, 1), tile_width, tile_width, tile_width, 3), offset) + (size(du, 1), tile_width, tile_width, tile_width, 3), + offset) - # Get thread and block indices - # bx, by, bz = blockIdx().x, blockIdx().y, blockIdx().z - # tx, ty, tz = threadIdx().x, threadIdx().y, threadIdx().z - - # Get thread and block indices we need save registers + # Get thread and block indices only we need save registers tx, ty = threadIdx().x, threadIdx().y # We launch one block in y direction so j = ty @@ -87,7 +85,7 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 # same as j2 ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 # same as j3 - # We laucn one block in x direction so i = tx + # We launch one block in x direction so i = tx # i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z @@ -100,28 +98,18 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, value = zero(eltype(du)) # Load global `derivative_dhat` into shared memory - # Note the memory access pattern matters here, transposed or not needs to be - # considered for better performance - # TODO: Better memory access pattern + # Transposed memory access or not? 
@inbounds begin shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2] end - # Load global `flux_arr1` into shared memory - # Load global `flux_arr2` into shared memory - # Load global `flux_arr3` into shared memory - # Note that `flux_arr1`, `flux_arr2` and `flux_arr3` are removed for smaller - # GPU memory allocation + # Load global `flux_arr1`, `flux_arr2`, and `flux_arr3` into shared memory, + # and note that they are removed for smaller GPU memory allocation u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) flux_node1 = flux(u_node, 1, equations) flux_node2 = flux(u_node, 2, equations) flux_node3 = flux(u_node, 3, equations) - # @inbounds begin - # shmem_flux[tx, ty1, ty2, ty3, 1] = flux_arr1[tx, ty1, ty2, ty3, k] - # shmem_flux[tx, ty1, ty2, ty3, 2] = flux_arr2[tx, ty1, ty2, ty3, k] - # shmem_flux[tx, ty1, ty2, ty3, 3] = flux_arr3[tx, ty1, ty2, ty3, k] - # end - # TODO: Better memory access pattern + @inbounds begin shmem_flux[tx, ty1, ty2, ty3, 1] = flux_node1[tx] shmem_flux[tx, ty1, ty2, ty3, 2] = flux_node2[tx] @@ -131,6 +119,8 @@ function flux_weak_form_kernel!(du, u, derivative_dhat, sync_threads() # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts + # How to replace shared memory `shmem_flux` with `flux_node`? 
for thread in 1:tile_width @inbounds begin value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, ty3, 1] @@ -183,6 +173,115 @@ function volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr return nothing end +# Kernel for calculating volume integrals +function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, + volume_flux_arr3, equations::AbstractEquations{3}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(du, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # fuse `reset_du!` here + + for ii in axes(du, 2) + @inbounds begin + du[i, j1, j2, j3, k] += derivative_split[j1, ii] * + volume_flux_arr1[i, j1, ii, j2, j3, k] + du[i, j1, j2, j3, k] += derivative_split[j2, ii] * + volume_flux_arr2[i, j1, j2, ii, j3, k] + du[i, j1, j2, j3, k] += derivative_split[j3, ii] * + volume_flux_arr3[i, j1, j2, j3, ii, k] + end + end + end + + return nothing +end + +# Kernel for calculating volume fluxes and volume integrals +# An optimized version of the fusion of `volume_flux_kernel!` and `volume_integral_kernel!` +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{3}, volume_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_vflux = @cuDynamicSharedMem(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width, tile_width, 3), + offset) + + # Get thread and block indices only we need save registers + tx, ty = threadIdx().x, 
threadIdx().y + + # We launch one block in y direction so j = ty + ty1 = div(ty - 1, tile_width^3) + 1 + ty2 = div(rem(ty - 1, tile_width^3), tile_width^2) + 1 + ty3 = div(rem(ty - 1, tile_width^2), tile_width) + 1 + ty4 = rem(ty - 1, tile_width) + 1 + + # We launch one block in x direction so i = tx + # i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + # j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # Tile the computation (restrict to one tile here) + value = zero(eltype(du)) + + # Load global `derivative_split` into shared memory + # Transposed memory access or not? + @inbounds begin + shmem_split[ty2, ty1] = derivative_split[ty1, ty2] + end + + # Load global `volume_flux_arr1`, `volume_flux_arr2`, and `volume_flux_arr3` + # into shared memory, and note that they are removed for smaller GPU memory allocation + u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) + u_node1 = get_node_vars(u, equations, ty4, ty2, ty3, k) + u_node2 = get_node_vars(u, equations, ty1, ty4, ty3, k) + u_node3 = get_node_vars(u, equations, ty1, ty2, ty4, k) + + volume_flux_node1 = volume_flux(u_node, u_node1, 1, equations) + volume_flux_node2 = volume_flux(u_node, u_node2, 2, equations) + volume_flux_node3 = volume_flux(u_node, u_node3, 3, equations) + + @inbounds begin + shmem_vflux[tx, ty1, ty4, ty2, ty3, 1] = volume_flux_node1[tx] + shmem_vflux[tx, ty1, ty2, ty4, ty3, 2] = volume_flux_node2[tx] + shmem_vflux[tx, ty1, ty2, ty3, ty4, 3] = volume_flux_node3[tx] + end + + sync_threads() + + # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts and parallelize (ty1, ty2, ty3) with threads + # to ty4, then consolidate each computation back to (ty1, ty2, ty3) + # How to replace shared memory `shmem_flux` with `flux_node`? 
+ for thread in 1:tile_width + @inbounds begin + value += shmem_split[thread, ty1] * shmem_vflux[tx, ty1, thread, ty2, ty3, 1] + value += shmem_split[thread, ty2] * shmem_vflux[tx, ty1, ty2, thread, ty3, 2] + value += shmem_split[thread, ty3] * shmem_vflux[tx, ty1, ty2, ty3, thread, 3] + end + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the weak form + @inbounds du[tx, ty1, ty2, ty3, k] = value + + return nothing +end + # Kernel for calculating symmetric and nonconservative fluxes function symmetric_noncons_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, symmetric_flux_arr3, noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, @@ -231,37 +330,6 @@ function symmetric_noncons_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2 return nothing end -# Kernel for calculating volume integrals -function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, - volume_flux_arr3, equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(du, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # fuse `reset_du!` here - - for ii in axes(du, 2) - @inbounds begin - du[i, j1, j2, j3, k] += derivative_split[j1, ii] * - volume_flux_arr1[i, j1, ii, j2, j3, k] - du[i, j1, j2, j3, k] += derivative_split[j2, ii] * - volume_flux_arr2[i, j1, j2, ii, j3, k] - du[i, j1, j2, j3, k] += derivative_split[j3, ii] * - volume_flux_arr3[i, j1, j2, j3, ii, k] - end - end - end - - return nothing -end - # Kernel for calculating symmetric and nonconservative volume integrals function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, 
symmetric_flux_arr2, symmetric_flux_arr3, @@ -1709,11 +1777,11 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms, # The maximum number of threads per block is the dominant factor when choosing the optimization # method. However, there are other factors that may cause a launch failure, such as the maximum - # number of registers per block. Here, we have omitted all other factors, but this should be - # enhanced later for a safer kernel launch. + # shared memory per block. Here, we have omitted all other factors, but this should be enhanced + # later for a safer kernel launch. # TODO: More checks before the kernel launch - thread_num_per_block = size(du, 1) * size(du, 2)^3 - if thread_num_per_block > MAX_THREADS_PER_BLOCK + thread_per_block = size(du, 1) * size(du, 2)^3 + if thread_per_block > MAX_THREADS_PER_BLOCK # TODO: How to optimize when size is large flux_arr1 = similar(u) flux_arr2 = similar(u) @@ -1722,8 +1790,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms, flux_kernel = @cuda launch=false flux_kernel!(flux_arr1, flux_arr2, flux_arr3, u, equations, flux) flux_kernel(flux_arr1, flux_arr2, flux_arr3, u, equations, flux; - kernel_configurator_3d(flux_kernel, size(u, 1), size(u, 2)^3, - size(u, 5))...) + kernel_configurator_2d(flux_kernel, size(u, 2)^3, size(u, 5))...) 
weak_form_kernel = @cuda launch=false weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3) @@ -1752,31 +1819,45 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_flux = volume_integral.volume_flux derivative_split = dg.basis.derivative_split - set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` - + # TODO: Move `set_diagonal_to_zero!` outside of `rhs!` loop and cache the result in + # DG struct on GPU + set_diagonal_to_zero!(derivative_split) derivative_split = CuArray(derivative_split) - volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), - size(u, 2), size(u, 5)) - volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), - size(u, 2), size(u, 5)) - volume_flux_arr3 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), - size(u, 2), size(u, 5)) - - volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, - volume_flux_arr3, u, equations, - volume_flux) - volume_flux_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, u, equations, - volume_flux; - kernel_configurator_2d(volume_flux_kernel, size(u, 2)^4, size(u, 5))...) - volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, - volume_flux_arr1, - volume_flux_arr2, - volume_flux_arr3, equations) - volume_integral_kernel(du, derivative_split, volume_flux_arr1, volume_flux_arr2, - volume_flux_arr3, equations; - kernel_configurator_3d(volume_integral_kernel, size(du, 1), - size(du, 2)^3, size(du, 5))...) 
+ thread_per_block = size(du, 1) * size(du, 2)^4 + if thread_per_block > MAX_THREADS_PER_BLOCK + # TODO: How to optimize when size is large + volume_flux_arr1 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), + size(u, 2), size(u, 5)) + volume_flux_arr2 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), + size(u, 2), size(u, 5)) + volume_flux_arr3 = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), + size(u, 2), size(u, 5)) + + volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, + volume_flux_arr3, u, equations, + volume_flux) + volume_flux_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, u, equations, + volume_flux; + kernel_configurator_2d(volume_flux_kernel, size(u, 2)^4, size(u, 5))...) + + volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, + volume_flux_arr1, + volume_flux_arr2, + volume_flux_arr3, equations) + volume_integral_kernel(du, derivative_split, volume_flux_arr1, volume_flux_arr2, + volume_flux_arr3, equations; + kernel_configurator_3d(volume_integral_kernel, size(du, 1), + size(du, 2)^3, size(du, 5))...) 
+ else + shmem_size = (size(du, 2)^2 + size(du, 1) * 3 * size(du, 2)^4) * sizeof(eltype(du)) + volume_flux_integral_kernel = @cuda launch=false volume_flux_integral_kernel!(du, u, derivative_split, + equations, volume_flux) + volume_flux_integral_kernel(du, u, derivative_split, equations, volume_flux; + shmem = shmem_size, + threads = (size(du, 1), size(du, 2)^4, 1), + blocks = (1, 1, size(du, 5))) + end return nothing end diff --git a/test/tree_dgsem_2d/euler_ec.jl b/test/tree_dgsem_2d/euler_ec.jl new file mode 100644 index 00000000..81fdc685 --- /dev/null +++ b/test/tree_dgsem_2d/euler_ec.jl @@ -0,0 +1,130 @@ +module TestEulerEC2D + +using Trixi, TrixiCUDA +using Test + +include("../test_macros.jl") + +@testset "Euler Shock 2D" begin + equations = CompressibleEulerEquations2D(1.4) + + initial_condition = initial_condition_weak_blast_wave + + volume_flux = flux_ranocha + solver = DGSEM(polydeg = 3, surface_flux = flux_ranocha, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux)) + + coordinates_min = (-2.0, -2.0) + coordinates_max = (2.0, 2.0) + mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 5, + n_cells_max = 10_000, + periodicity = true) + + semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + boundary_conditions = boundary_condition_periodic) + semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver, + boundary_conditions = boundary_condition_periodic) + + tspan = tspan_gpu = (0.0, 0.4) + t = t_gpu = 0.0 + + # Semi on CPU + (; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi + + # Semi on GPU + equations_gpu = semi_gpu.equations + mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache + boundary_conditions_gpu = semi_gpu.boundary_conditions + source_terms_gpu = semi_gpu.source_terms + + # ODE on CPU + ode = semidiscretize(semi, tspan) + u_ode = copy(ode.u0) + du_ode = similar(u_ode) + u = 
Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+    du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+    # ODE on GPU
+    ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+    u_gpu = copy(ode_gpu.u0)
+    du_gpu = similar(u_gpu)
+
+    # Tests for components initialization
+    @test_approx (u_gpu, u)
+    # du is initialized as undefined, cannot test now
+
+    # Tests for semidiscretization process
+    # TrixiCUDA.reset_du!(du_gpu)
+    # Trixi.reset_du!(du, solver, cache)
+    # @test_approx (du_gpu, du)
+
+    Trixi.reset_du!(du, solver, cache)
+
+    TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                    Trixi.have_nonconservative_terms(equations_gpu),
+                                    equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                    cache_gpu)
+    Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations),
+                                equations, solver.volume_integral, solver, cache)
+    @test_approx (du_gpu, du)
+
+    TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
+    Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver)
+    @test_approx (cache_gpu.interfaces.u, cache.interfaces.u)
+
+    TrixiCUDA.cuda_interface_flux!(mesh_gpu,
+                                   Trixi.have_nonconservative_terms(equations_gpu),
+                                   equations_gpu, solver_gpu, cache_gpu)
+    Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh,
+                               Trixi.have_nonconservative_terms(equations), equations,
+                               solver.surface_integral, solver, cache)
+    @test_approx (cache_gpu.elements.surface_flux_values,
+                  cache.elements.surface_flux_values)
+
+    TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
+                                       equations_gpu, cache_gpu)
+    Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver)
+    @test_approx (cache_gpu.boundaries.u, cache.boundaries.u)
+
+    TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
+                                  Trixi.have_nonconservative_terms(equations_gpu),
+                                  equations_gpu, solver_gpu, cache_gpu)
+    Trixi.calc_boundary_flux!(cache, t, 
boundary_conditions, mesh, equations, + solver.surface_integral, solver) + @test_approx (cache_gpu.elements.surface_flux_values, + cache.elements.surface_flux_values) + + TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, + TrixiCUDA.check_cache_mortars(cache_gpu), + solver_gpu, cache_gpu) + Trixi.prolong2mortars!(cache, u, mesh, equations, solver.mortar, + solver.surface_integral, solver) + @test_approx (cache_gpu.mortars.u_upper, cache.mortars.u_upper) + @test_approx (cache_gpu.mortars.u_lower, cache.mortars.u_lower) + + TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu, cache_gpu) + Trixi.calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.mortar, solver.surface_integral, solver, cache) + @test_approx (cache_gpu.elements.surface_flux_values, + cache.elements.surface_flux_values) + + TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) + Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, + solver, cache) + @test_approx (du_gpu, du) + + TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) + Trixi.apply_jacobian!(du, mesh, equations, solver, cache) + @test_approx (du_gpu, du) + + TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, + equations_gpu, cache_gpu) + Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) + @test_approx (du_gpu, du) +end + +end # module diff --git a/test/tree_dgsem_2d/tree_dgsem_2d.jl b/test/tree_dgsem_2d/tree_dgsem_2d.jl index 8eaaa2d5..b8867b46 100644 --- a/test/tree_dgsem_2d/tree_dgsem_2d.jl +++ b/test/tree_dgsem_2d/tree_dgsem_2d.jl @@ -1,6 +1,7 @@ include("advection_basic.jl") include("advection_mortar.jl") include("euler_blob_mortar.jl") +include("euler_ec.jl") include("euler_shock.jl") include("euler_source_terms_nonperiodic.jl") 
include("euler_source_terms.jl")