diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl
index fd501a7257c..85880380c61 100644
--- a/src/callbacks_step/analysis_dg3d.jl
+++ b/src/callbacks_step/analysis_dg3d.jl
@@ -376,6 +376,11 @@ function analyze(::Val{:linf_divb}, du, u, t,
         end
     end
 
+    if mpi_isparallel()
+        # Base.max instead of max needed, see comment in src/auxiliary/math.jl
+        linf_divb = MPI.Allreduce!(Ref(linf_divb), Base.max, mpi_comm())[]
+    end
+
     return linf_divb
 end
 
@@ -412,6 +417,11 @@ function analyze(::Val{:linf_divb}, du, u, t,
         end
     end
 
+    if mpi_isparallel()
+        # Base.max instead of max needed, see comment in src/auxiliary/math.jl
+        linf_divb = MPI.Allreduce!(Ref(linf_divb), Base.max, mpi_comm())[]
+    end
+
     return linf_divb
 end
 end # @muladd
diff --git a/src/callbacks_step/analysis_dg3d_parallel.jl b/src/callbacks_step/analysis_dg3d_parallel.jl
index 70a616367cd..285f0f62de6 100644
--- a/src/callbacks_step/analysis_dg3d_parallel.jl
+++ b/src/callbacks_step/analysis_dg3d_parallel.jl
@@ -72,10 +72,12 @@ function integrate_via_indices(func::Func, u,
     @unpack weights = dg.basis
 
     # Initialize integral with zeros of the right shape
-    # Pass `zero(SVector{nvariables(equations), eltype(u))}` to `func` since `u` might be empty, if the
-    # current rank has no elements, see also https://github.com/trixi-framework/Trixi.jl/issues/1096.
-    integral = zero(func(zero(SVector{nvariables(equations), eltype(u)}), 1, 1, 1, 1,
-                         equations, dg, args...))
+    # Pass `zeros(eltype(u), nvariables(equations), nnodes(dg), nnodes(dg), nnodes(dg), 1)`
+    # to `func` since `u` might be empty, if the current rank has no elements.
+    # See also https://github.com/trixi-framework/Trixi.jl/issues/1096, and
+    # https://github.com/trixi-framework/Trixi.jl/pull/2126/files/7cbc57cfcba93e67353566e10fce1f3edac27330#r1814483243.
+    integral = zero(func(zeros(eltype(u), nvariables(equations), nnodes(dg), nnodes(dg),
+                               nnodes(dg), 1), 1, 1, 1, 1, equations, dg, args...))
     volume = zero(real(mesh))
 
     # Use quadrature to numerically integrate over entire domain
diff --git a/src/callbacks_step/glm_speed_dg.jl b/src/callbacks_step/glm_speed_dg.jl
index 302aae356ab..4d9e4eee23f 100644
--- a/src/callbacks_step/glm_speed_dg.jl
+++ b/src/callbacks_step/glm_speed_dg.jl
@@ -13,6 +13,14 @@ function calc_dt_for_cleaning_speed(cfl::Real, mesh,
     # Cartesian meshes
     max_scaled_speed_for_c_h = maximum(cache.elements.inverse_jacobian) * ndims(equations)
+
+    if mpi_isparallel()
+        # Base.max instead of max needed, see comment in src/auxiliary/math.jl
+        max_scaled_speed_for_c_h = MPI.Allreduce!(Ref(max_scaled_speed_for_c_h),
+                                                  Base.max,
+                                                  mpi_comm())[]
+    end
+
     # OBS! This depends on the implementation details of the StepsizeCallback and needs to be adapted
     # as well if that callback changes.
     return cfl * 2 / (nnodes(dg) * max_scaled_speed_for_c_h)
@@ -29,6 +37,12 @@ function calc_dt_for_cleaning_speed(cfl::Real, mesh,
     # Copies implementation behavior of `calc_dt_for_cleaning_speed` for DGSEM discretizations.
     max_scaled_speed_for_c_h = inv(minimum(md.J)) * ndims(equations)
 
+    if mpi_isparallel()
+        # Base.max instead of max needed, see comment in src/auxiliary/math.jl
+        max_scaled_speed_for_c_h = MPI.Allreduce!(Ref(max_scaled_speed_for_c_h),
+                                                  Base.max,
+                                                  mpi_comm())[]
+    end
     # This mimics `max_dt` for `TreeMesh`, except that `nnodes(dg)` is replaced by
     # `polydeg+1`. This is because `nnodes(dg)` returns the total number of
     # multi-dimensional nodes for DGMulti solver types, while `nnodes(dg)` returns
diff --git a/src/solvers/dgsem_p4est/dg_3d_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parallel.jl
index 3daca10e821..e51836e8a51 100644
--- a/src/solvers/dgsem_p4est/dg_3d_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_3d_parallel.jl
@@ -267,6 +267,40 @@ end
     end
 end
 
+# Inlined version of the interface flux computation for non-conservative equations
+@inline function calc_mpi_interface_flux!(surface_flux_values,
+                                          mesh::Union{ParallelP4estMesh{3},
+                                                      ParallelT8codeMesh{3}},
+                                          nonconservative_terms::True, equations,
+                                          surface_integral, dg::DG, cache,
+                                          interface_index, normal_direction,
+                                          interface_i_node_index,
+                                          interface_j_node_index, local_side,
+                                          surface_i_node_index, surface_j_node_index,
+                                          local_direction_index, local_element_index)
+    @unpack u = cache.mpi_interfaces
+    surface_flux, nonconservative_flux = surface_integral.surface_flux
+
+    u_ll, u_rr = get_surface_node_vars(u, equations, dg,
+                                       interface_i_node_index, interface_j_node_index,
+                                       interface_index)
+
+    # Compute flux and non-conservative term for this side of the interface
+    if local_side == 1
+        flux_ = surface_flux(u_ll, u_rr, normal_direction, equations)
+        noncons_flux_ = nonconservative_flux(u_ll, u_rr, normal_direction, equations)
+    else # local_side == 2
+        flux_ = -surface_flux(u_ll, u_rr, -normal_direction, equations)
+        noncons_flux_ = -nonconservative_flux(u_rr, u_ll, -normal_direction, equations)
+    end
+
+    for v in eachvariable(equations)
+        surface_flux_values[v, surface_i_node_index, surface_j_node_index,
+                            local_direction_index, local_element_index] = flux_[v] +
+                                                                          0.5f0 * noncons_flux_[v]
+    end
+end
+
 function prolong2mpimortars!(cache, u,
                              mesh::Union{ParallelP4estMesh{3},
                                          ParallelT8codeMesh{3}}, equations,
@@ -384,12 +418,13 @@
                                surface_integral, dg::DG, cache)
     @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars
     @unpack contravariant_vectors = cache.elements
-    @unpack fstar_primary_threaded, fstar_tmp_threaded = cache
+    @unpack fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded = cache
     index_range = eachnode(dg)
 
     @threaded for mortar in eachmpimortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar = fstar_primary_threaded[Threads.threadid()]
+        fstar_primary = fstar_primary_threaded[Threads.threadid()]
+        fstar_secondary = fstar_secondary_threaded[Threads.threadid()]
         fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
 
         # Get index information on the small elements
@@ -412,7 +447,8 @@
                 normal_direction = get_normal_direction(cache.mpi_mortars, i, j,
                                                         position, mortar)
 
-                calc_mpi_mortar_flux!(fstar, mesh, nonconservative_terms, equations,
+                calc_mpi_mortar_flux!(fstar_primary, fstar_secondary, mesh,
+                                      nonconservative_terms, equations,
                                       surface_integral, dg, cache,
                                       mortar, position, normal_direction,
                                       i, j)
@@ -433,14 +469,15 @@
         mpi_mortar_fluxes_to_elements!(surface_flux_values,
                                        mesh, equations, mortar_l2, dg, cache,
-                                       mortar, fstar, u_buffer, fstar_tmp)
+                                       mortar, fstar_primary, fstar_secondary, u_buffer,
+                                       fstar_tmp)
     end
 
     return nothing
 end
 
 # Inlined version of the mortar flux computation on small elements for conservation laws
-@inline function calc_mpi_mortar_flux!(fstar,
+@inline function calc_mpi_mortar_flux!(fstar_primary, fstar_secondary,
                                        mesh::Union{ParallelP4estMesh{3},
                                                    ParallelT8codeMesh{3}},
                                        nonconservative_terms::False, equations,
@@ -456,16 +493,48 @@
     flux = surface_flux(u_ll, u_rr, normal_direction, equations)
 
     # Copy flux to buffer
-    set_node_vars!(fstar, flux, equations, dg, i_node_index, j_node_index,
+    set_node_vars!(fstar_primary, flux, equations, dg, i_node_index, j_node_index,
+                   position_index)
+    set_node_vars!(fstar_secondary, flux, equations, dg, i_node_index, j_node_index,
                    position_index)
 end
 
+# Inlined version of the mortar flux computation on small elements for non-conservative equations
+@inline function calc_mpi_mortar_flux!(fstar_primary, fstar_secondary,
+                                       mesh::Union{ParallelP4estMesh{3},
+                                                   ParallelT8codeMesh{3}},
+                                       nonconservative_terms::True, equations,
+                                       surface_integral, dg::DG, cache,
+                                       mortar_index, position_index, normal_direction,
+                                       i_node_index, j_node_index)
+    @unpack u = cache.mpi_mortars
+    surface_flux, nonconservative_flux = surface_integral.surface_flux
+
+    u_ll, u_rr = get_surface_node_vars(u, equations, dg, position_index, i_node_index,
+                                       j_node_index, mortar_index)
+
+    flux = surface_flux(u_ll, u_rr, normal_direction, equations)
+    noncons_flux_primary = nonconservative_flux(u_ll, u_rr, normal_direction, equations)
+    noncons_flux_secondary = nonconservative_flux(u_rr, u_ll, normal_direction,
+                                                  equations)
+
+    for v in eachvariable(equations)
+        fstar_primary[v, i_node_index, j_node_index, position_index] = flux[v] +
+                                                                       0.5f0 *
+                                                                       noncons_flux_primary[v]
+        fstar_secondary[v, i_node_index, j_node_index, position_index] = flux[v] +
+                                                                         0.5f0 *
+                                                                         noncons_flux_secondary[v]
+    end
+end
+
 @inline function mpi_mortar_fluxes_to_elements!(surface_flux_values,
                                                 mesh::Union{ParallelP4estMesh{3},
                                                             ParallelT8codeMesh{3}},
                                                 equations,
                                                 mortar_l2::LobattoLegendreMortarL2,
-                                                dg::DGSEM, cache, mortar, fstar,
+                                                dg::DGSEM, cache, mortar, fstar_primary,
+                                                fstar_secondary,
                                                 u_buffer, fstar_tmp)
     @unpack local_neighbor_ids, local_neighbor_positions, node_indices = cache.mpi_mortars
     index_range = eachnode(dg)
@@ -487,22 +556,22 @@
         # Project small fluxes to large element.
         multiply_dimensionwise!(u_buffer,
                                 mortar_l2.reverse_lower, mortar_l2.reverse_lower,
-                                view(fstar, .., 1),
+                                view(fstar_secondary, .., 1),
                                 fstar_tmp)
         add_multiply_dimensionwise!(u_buffer,
                                     mortar_l2.reverse_upper, mortar_l2.reverse_lower,
-                                    view(fstar, .., 2),
+                                    view(fstar_secondary, .., 2),
                                     fstar_tmp)
         add_multiply_dimensionwise!(u_buffer,
                                     mortar_l2.reverse_lower, mortar_l2.reverse_upper,
-                                    view(fstar, .., 3),
+                                    view(fstar_secondary, .., 3),
                                     fstar_tmp)
         add_multiply_dimensionwise!(u_buffer,
                                     mortar_l2.reverse_upper, mortar_l2.reverse_upper,
-                                    view(fstar, .., 4),
+                                    view(fstar_secondary, .., 4),
                                     fstar_tmp)
         # The flux is calculated in the outward direction of the small elements,
         # so the sign must be switched to get the flux in outward direction
@@ -536,10 +605,10 @@
             for j in eachnode(dg)
                 for i in eachnode(dg)
                     for v in eachvariable(equations)
-                        surface_flux_values[v, i, j, small_direction, element] = fstar[v,
-                                                                                       i,
-                                                                                       j,
-                                                                                       position]
+                        surface_flux_values[v, i, j, small_direction, element] = fstar_primary[v,
+                                                                                               i,
+                                                                                               j,
+                                                                                               position]
                     end
                 end
             end
diff --git a/test/test_mpi_p4est_3d.jl b/test/test_mpi_p4est_3d.jl
index d6018e51c81..663732b4ded 100644
--- a/test/test_mpi_p4est_3d.jl
+++ b/test/test_mpi_p4est_3d.jl
@@ -235,6 +235,80 @@ const EXAMPLES_DIR = pkgdir(Trixi, "examples", "p4est_3d_dgsem")
             @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
         end
     end
+
+    @trixi_testset "elixir_mhd_alfven_wave_nonconforming.jl" begin
+        @test_trixi_include(joinpath(EXAMPLES_DIR,
+                                     "elixir_mhd_alfven_wave_nonconforming.jl"),
+                            l2=[
+                                0.0001788543743594658,
+                                0.000624334205581902,
+                                0.00022892869974368887,
+                                0.0007223464581156573,
+                                0.0006651366626523314,
+                                0.0006287275014743352,
+                                0.000344484339916008,
+                                0.0007179788287557142,
+                                8.632896980651243e-7
+                            ],
+                            linf=[
+                                0.0010730565632763867,
+                                0.004596749809344033,
+                                0.0013235269262853733,
+                                0.00468874234888117,
+                                0.004719267084104306,
+                                0.004228339352211896,
+                                0.0037503625505571625,
+                                0.005104176909383168,
+                                9.738081186490818e-6
+                            ],
+                            tspan=(0.0, 0.25),
+                            coverage_override=(trees_per_dimension = (1, 1, 1),))
+        # Ensure that we do not have excessive memory allocations
+        # (e.g., from type instabilities)
+        let
+            t = sol.t[end]
+            u_ode = sol.u[end]
+            du_ode = similar(u_ode)
+            @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+        end
+    end
+
+    # Same test as above but with only one tree in the mesh
+    # We use it to test meshes with elements of different size in each partition
+    @trixi_testset "elixir_mhd_alfven_wave_nonconforming.jl with one tree" begin
+        @test_trixi_include(joinpath(EXAMPLES_DIR,
+                                     "elixir_mhd_alfven_wave_nonconforming.jl"),
+                            l2=[
+                                0.0019054118500017054,
+                                0.006957977226608083,
+                                0.003429930594167365,
+                                0.009051598556176287,
+                                0.0077261662742688425,
+                                0.008210851821439208,
+                                0.003763030674412298,
+                                0.009175470744760567,
+                                2.881690753923244e-5
+                            ],
+                            linf=[
+                                0.010983704624623503,
+                                0.04584128974425262,
+                                0.02022630484954286,
+                                0.04851342295826149,
+                                0.040710154751363525,
+                                0.044722299260292586,
+                                0.036591209423654236,
+                                0.05701669133068068,
+                                0.00024182906501186622
+                            ],
+                            tspan=(0.0, 0.25), trees_per_dimension=(1, 1, 1),
+                            coverage_override=(trees_per_dimension = (1, 1, 1),))
+        # Ensure that we do not have excessive memory allocations
+        # (e.g., from type instabilities)
+        let
+            t = sol.t[end]
+            u_ode = sol.u[end]
+            du_ode = similar(u_ode)
+            @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+        end
+    end
 end
 end # P4estMesh MPI
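
Several hunks above (in analysis_dg3d.jl and glm_speed_dg.jl) repeat the same pattern: each MPI rank first computes a purely local maximum (`linf_divb` or `max_scaled_speed_for_c_h`) and then combines it across all ranks with an all-reduce, passing `Base.max` explicitly because Trixi.jl defines its own `max` method (see the comment referenced in `src/auxiliary/math.jl`). Below is a minimal, standalone MPI.jl sketch of that reduction pattern; it is not part of the patch, and the script name and the stand-in local value are made up for illustration.

# reduce_max_sketch.jl -- hypothetical standalone example, not part of Trixi.jl
using MPI

MPI.Init()
comm = MPI.COMM_WORLD

# Stand-in for a locally computed maximum such as `linf_divb`:
# here each rank simply contributes its rank number plus one.
local_max = float(MPI.Comm_rank(comm) + 1)

# Combine the local maxima into the global maximum, available on every rank.
# Outside Trixi the plain `max` can be passed as the reduction operator.
global_max = MPI.Allreduce(local_max, max, comm)

if MPI.Comm_rank(comm) == 0
    println("global maximum: ", global_max)
end

MPI.Finalize()

Run with, e.g., `mpiexecjl -n 4 julia reduce_max_sketch.jl`. The patch itself uses the in-place form `MPI.Allreduce!(Ref(x), Base.max, mpi_comm())[]`, which performs the same reduction on a one-element buffer and reads the result back out of the `Ref`.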