diff --git a/benchmark/benchmark_1d.jl b/benchmark/benchmark_1d.jl
index f8f4708..1e9e6e3 100644
--- a/benchmark/benchmark_1d.jl
+++ b/benchmark/benchmark_1d.jl
@@ -2,20 +2,32 @@ using Trixi, TrixiCUDA
 using CUDA
 using BenchmarkTools
 
+# Set the precision
+RealT = Float32
+
 # Set up the problem
-equations = CompressibleEulerEquations1D(1.4)
+equations = CompressibleEulerEquations1D(1.4f0)
 initial_condition = initial_condition_weak_blast_wave
 
-volume_flux = flux_ranocha
-solver = DGSEM(polydeg = 3, surface_flux = flux_ranocha,
-               volume_integral = VolumeIntegralFluxDifferencing(volume_flux))
-
-coordinates_min = (-2.0,)
-coordinates_max = (2.0,)
+surface_flux = flux_lax_friedrichs
+volume_flux = flux_shima_etal
+basis = LobattoLegendreBasis(RealT, 3)
+indicator_sc = IndicatorHennemannGassner(equations, basis,
+                                         alpha_max = 0.5f0,
+                                         alpha_min = 0.001f0,
+                                         alpha_smooth = true,
+                                         variable = density_pressure)
+volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
+                                                 volume_flux_dg = volume_flux,
+                                                 volume_flux_fv = surface_flux)
+solver = DGSEM(basis, surface_flux, volume_integral)
+
+coordinates_min = -2.0f0
+coordinates_max = 2.0f0
 mesh = TreeMesh(coordinates_min, coordinates_max,
                 initial_refinement_level = 5,
-                n_cells_max = 10_000)
+                n_cells_max = 10_000, RealT = RealT)
 
 # Cache initialization
 @info "Time for cache initialization on CPU"
@@ -23,8 +35,8 @@ mesh = TreeMesh(coordinates_min, coordinates_max,
 @info "Time for cache initialization on GPU"
 CUDA.@time semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
 
-tspan = tspan_gpu = (0.0, 0.4)
-t = t_gpu = 0.0
+tspan = tspan_gpu = (0.0f0, 0.4f0)
+t = t_gpu = 0.0f0
 
 # Semi on CPU
 (; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
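
With the benchmark parameterized by `RealT`, a quick sanity check is that the setup really ends up in single precision end to end. A minimal sketch, run after the setup above; `real(solver)` is Trixi's accessor for the basis precision, and `semidiscretizeGPU` is the same entry point used in `test.jl` below:

```julia
# After running the benchmark setup above:
@assert real(solver) == Float32        # basis and operators follow RealT
ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
@assert eltype(ode_gpu.u0) == Float32  # GPU solution vector follows RealT
```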
diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl
index 17d622a..0af7966 100644
--- a/src/semidiscretization/semidiscretization_hyperbolic.jl
+++ b/src/semidiscretization/semidiscretization_hyperbolic.jl
@@ -4,9 +4,8 @@
 function SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver;
                                          source_terms = nothing,
                                          boundary_conditions = boundary_condition_periodic,
-                                         # `RealT` is used as real type for node locations etc.
-                                         # while `uEltype` is used as element type of solutions etc.
-                                         RealT = real(solver), uEltype = RealT,
+                                         RealT = real(solver), # `RealT` is used as real type for node locations etc.
+                                         uEltype = RealT, # `uEltype` is used as element type of solutions etc.
                                          initial_cache = NamedTuple())
     @assert ndims(mesh) == ndims(equations)
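
Rearranging the keyword arguments keeps the defaults intact: `uEltype` still defaults to `RealT`, which defaults to `real(solver)`. A hedged sketch of the two call patterns; the mixed-precision variant is hypothetical and only meaningful if the downstream kernels support distinct types:

```julia
# Default: node locations and solution share the solver's precision.
semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)

# Hypothetical mixed-precision call: Float32 node locations, Float64 solution.
semi_mixed = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver;
                                             RealT = Float32, uEltype = Float64)
```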
diff --git a/src/solvers/cache.jl b/src/solvers/cache.jl
index ada2ebd..66f707e 100644
--- a/src/solvers/cache.jl
+++ b/src/solvers/cache.jl
@@ -49,8 +49,8 @@ end
 function create_cache_gpu(mesh::TreeMesh{1}, equations,
                           volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM,
                           uEltype, cache)
-    fstar1_L = CUDA.zeros(Float64, nvariables(equations), nnodes(dg) + 1, nelements(cache.elements))
-    fstar1_R = CUDA.zeros(Float64, nvariables(equations), nnodes(dg) + 1, nelements(cache.elements))
+    fstar1_L = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg) + 1, nelements(cache.elements))
+    fstar1_R = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg) + 1, nelements(cache.elements))
 
     cache = create_cache_gpu(mesh, equations,
                              VolumeIntegralFluxDifferencing(volume_integral.volume_flux_dg),
@@ -103,13 +103,13 @@ end
 function create_cache_gpu(mesh::TreeMesh{2}, equations,
                           volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM,
                           uEltype, cache)
-    fstar1_L = CUDA.zeros(Float64, nvariables(equations), nnodes(dg) + 1, nnodes(dg),
+    fstar1_L = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg) + 1, nnodes(dg),
                           nelements(cache.elements))
-    fstar1_R = CUDA.zeros(Float64, nvariables(equations), nnodes(dg) + 1, nnodes(dg),
+    fstar1_R = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg) + 1, nnodes(dg),
                           nelements(cache.elements))
-    fstar2_L = CUDA.zeros(Float64, nvariables(equations), nnodes(dg), nnodes(dg) + 1,
+    fstar2_L = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg), nnodes(dg) + 1,
                           nelements(cache.elements))
-    fstar2_R = CUDA.zeros(Float64, nvariables(equations), nnodes(dg), nnodes(dg) + 1,
+    fstar2_R = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg), nnodes(dg) + 1,
                           nelements(cache.elements))
 
     cache = create_cache_gpu(mesh, equations,
@@ -121,13 +121,13 @@ end
 function create_cache_gpu(mesh::TreeMesh{2}, equations,
                           mortar_l2::LobattoLegendreMortarL2, uEltype, cache)
-    fstar_primary_upper = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_primary_upper = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                      nmortars(cache.mortars))
-    fstar_primary_lower = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_primary_lower = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                      nmortars(cache.mortars))
-    fstar_secondary_upper = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_secondary_upper = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                        nmortars(cache.mortars))
-    fstar_secondary_lower = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_secondary_lower = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                        nmortars(cache.mortars))
 
     (; fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower)
@@ -176,17 +176,17 @@ end
 function create_cache_gpu(mesh::TreeMesh{3}, equations,
                           volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM,
                           uEltype, cache)
-    fstar1_L = CUDA.zeros(Float64, nvariables(equations), nnodes(dg) + 1, nnodes(dg),
+    fstar1_L = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg) + 1, nnodes(dg),
                           nnodes(dg), nelements(cache.elements))
-    fstar1_R = CUDA.zeros(Float64, nvariables(equations), nnodes(dg) + 1, nnodes(dg),
+    fstar1_R = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg) + 1, nnodes(dg),
                           nnodes(dg), nelements(cache.elements))
-    fstar2_L = CUDA.zeros(Float64, nvariables(equations), nnodes(dg), nnodes(dg) + 1,
+    fstar2_L = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg), nnodes(dg) + 1,
                           nnodes(dg), nelements(cache.elements))
-    fstar2_R = CUDA.zeros(Float64, nvariables(equations), nnodes(dg), nnodes(dg) + 1,
+    fstar2_R = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg), nnodes(dg) + 1,
                           nnodes(dg), nelements(cache.elements))
-    fstar3_L = CUDA.zeros(Float64, nvariables(equations), nnodes(dg), nnodes(dg),
+    fstar3_L = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg), nnodes(dg),
                           nnodes(dg) + 1, nelements(cache.elements))
-    fstar3_R = CUDA.zeros(Float64, nvariables(equations), nnodes(dg), nnodes(dg),
+    fstar3_R = CUDA.zeros(uEltype, nvariables(equations), nnodes(dg), nnodes(dg),
                           nnodes(dg) + 1, nelements(cache.elements))
 
     cache = create_cache_gpu(mesh, equations,
@@ -198,21 +198,21 @@ end
 function create_cache_gpu(mesh::TreeMesh{3}, equations,
                           mortar_l2::LobattoLegendreMortarL2, uEltype, cache)
-    fstar_primary_upper_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_primary_upper_left = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                           nnodes(mortar_l2), nmortars(cache.mortars))
-    fstar_primary_upper_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_primary_upper_right = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                            nnodes(mortar_l2), nmortars(cache.mortars))
-    fstar_primary_lower_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_primary_lower_left = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                           nnodes(mortar_l2), nmortars(cache.mortars))
-    fstar_primary_lower_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_primary_lower_right = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                            nnodes(mortar_l2), nmortars(cache.mortars))
-    fstar_secondary_upper_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_secondary_upper_left = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                             nnodes(mortar_l2), nmortars(cache.mortars))
-    fstar_secondary_upper_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_secondary_upper_right = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                              nnodes(mortar_l2), nmortars(cache.mortars))
-    fstar_secondary_lower_left = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_secondary_lower_left = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                             nnodes(mortar_l2), nmortars(cache.mortars))
-    fstar_secondary_lower_right = CUDA.zeros(Float64, nvariables(equations), nnodes(mortar_l2),
+    fstar_secondary_lower_right = CUDA.zeros(uEltype, nvariables(equations), nnodes(mortar_l2),
                                              nnodes(mortar_l2), nmortars(cache.mortars))
 
     # Temporary arrays can also be created here
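
Threading `uEltype` through to `CUDA.zeros` is what makes these caches precision-aware: `CUDA.zeros(T, dims...)` allocates a zero-initialized `CuArray{T}`, so the `fstar*` buffers now match the solution type instead of being pinned to `Float64`. For example:

```julia
using CUDA

fstar = CUDA.zeros(Float32, 3, 5, 16)  # CuArray{Float32, 3}, filled with 0.0f0
@assert eltype(fstar) == Float32
@assert all(iszero, Array(fstar))
```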
diff --git a/src/solvers/common.jl b/src/solvers/common.jl
index 47a28c7..9678c8a 100644
--- a/src/solvers/common.jl
+++ b/src/solvers/common.jl
@@ -1,7 +1,7 @@
 # Some common functions that are shared between the solvers.
 
 # Copy data from CPU to GPU
-function reset_du!(du::CuArray{Float64})
+function reset_du!(du::CuArray)
     du_zero = zero(du)
     du .= du_zero # no scalar indexing
 
@@ -9,10 +9,10 @@ end
 
 # Set diagonal entries of a matrix to zeros
-function set_diagonal_to_zero!(A::Array{Float64})
+function set_diagonal_to_zero!(A::Array)
     n = min(size(A)...)
     for i in 1:n
-        A[i, i] = 0.0 # change back to `Float32`
+        A[i, i] = zero(eltype(A))
     end
     return nothing
 end
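
Dropping the `Float64` restriction makes both helpers type-generic; `zero(eltype(A))` yields the right zero for whatever element type the matrix carries. A usage sketch, calling the (internal, unexported) helper by its qualified name:

```julia
using TrixiCUDA

A32 = ones(Float32, 4, 4)
TrixiCUDA.set_diagonal_to_zero!(A32)  # now works for Float32 ...
A64 = ones(Float64, 4, 4)
TrixiCUDA.set_diagonal_to_zero!(A64)  # ... and still for Float64
@assert A32[1, 1] === 0.0f0 && A64[1, 1] === 0.0
```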
diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl
index 92a62d7..1b55564 100644
--- a/src/solvers/dg_1d.jl
+++ b/src/solvers/dg_1d.jl
@@ -121,14 +121,14 @@ function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, nonco
 
     if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3))
         @inbounds begin
-            integral_contribution = 0.0 # change back to `Float32`
+            integral_contribution = zero(eltype(du))
             for ii in axes(du, 2)
                 du[i, j, k] += symmetric_flux_arr[i, j, ii, k]
                 integral_contribution += derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]
            end
 
-            du[i, j, k] += 0.5 * integral_contribution # change back to `Float32`
+            du[i, j, k] += 0.5f0 * integral_contribution
         end
     end
 
@@ -260,8 +260,8 @@ function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, f
 
             @inbounds begin
                 for ii in axes(u, 1)
-                    fstar1_L[ii, j1, element_dgfv] = f1_node[ii] + 0.5 * f1_L_node[ii]
-                    fstar1_R[ii, j1, element_dgfv] = f1_node[ii] + 0.5 * f1_R_node[ii]
+                    fstar1_L[ii, j1, element_dgfv] = f1_node[ii] + 0.5f0 * f1_L_node[ii]
+                    fstar1_R[ii, j1, element_dgfv] = f1_node[ii] + 0.5f0 * f1_R_node[ii]
                 end
             end
         end
@@ -288,7 +288,7 @@ function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha,
 
         @inbounds begin
             if element_dg != 0 # bad
-                integral_contribution = 0.0
+                integral_contribution = zero(eltype(du))
 
                 for ii in axes(du, 2)
                     du[i, j, element_dg] += volume_flux_arr[i, j, ii, element_dg]
@@ -296,11 +296,11 @@ function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha,
                                             noncons_flux_arr[i, j, ii, element_dg]
                 end
 
-                du[i, j, element_dg] += 0.5 * integral_contribution
+                du[i, j, element_dg] += 0.5f0 * integral_contribution
             end
 
             if element_dgfv != 0 # bad
-                integral_contribution = 0.0
+                integral_contribution = zero(eltype(du))
 
                 for ii in axes(du, 2)
                     du[i, j, element_dgfv] += (1 - alpha_element) *
@@ -309,7 +309,7 @@ function volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha,
                                               noncons_flux_arr[i, j, ii, element_dgfv]
                 end
 
-                du[i, j, element_dgfv] += 0.5 * (1 - alpha_element) * integral_contribution
+                du[i, j, element_dgfv] += 0.5f0 * (1 - alpha_element) * integral_contribution
             end
         end
     end
@@ -436,9 +436,9 @@ function interface_flux_kernel!(surface_flux_values, surface_flux_arr, noncons_l
 
         @inbounds begin
            surface_flux_values[i, 2, left_id] = surface_flux_arr[i, j] +
-                                                 0.5 * noncons_left_arr[i, j] # change back to `Float32`
+                                                 0.5f0 * noncons_left_arr[i, j]
            surface_flux_values[i, 1, right_id] = surface_flux_arr[i, j] +
-                                                  0.5 * noncons_right_arr[i, j] # change back to `Float32`
+                                                  0.5f0 * noncons_right_arr[i, j]
         end
     end
 
@@ -540,7 +540,7 @@ function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinat
 
         @inbounds begin
             for ii in axes(surface_flux_values, 1)
                 surface_flux_values[ii, direction, neighbor] = flux_node[ii] +
-                                                               0.5 * noncons_flux_node[ii]
+                                                               0.5f0 * noncons_flux_node[ii]
             end
         end
     end
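
Replacing `0.5` with `0.5f0` in the kernels is harmless for double precision: Julia promotes `Float32 * Float64` to `Float64`, so `Float64` runs are unchanged, while `Float32` runs no longer get silently promoted by the literal. A small illustration:

```julia
@assert 0.5f0 * 2.0 === 1.0      # promotes to Float64: no precision loss
@assert 0.5f0 * 2.0f0 === 1.0f0  # stays Float32: no hidden Float64 temporaries
@assert 0.5 * 2.0f0 === 1.0      # the old literal forced Float64 in Float32 runs
```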
@@ -610,7 +610,7 @@ end
 # Pack kernels for calculating volume integrals
 function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms, equations,
                                volume_integral::VolumeIntegralWeakForm, dg::DGSEM, cache)
-    derivative_dhat = CuArray{Float64}(dg.basis.derivative_dhat)
+    derivative_dhat = CuArray(dg.basis.derivative_dhat)
     flux_arr = similar(u)
 
     flux_kernel = @cuda launch=false flux_kernel!(flux_arr, u, equations, flux)
@@ -628,13 +628,14 @@ end
 function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::False, equations,
                                volume_integral::VolumeIntegralFluxDifferencing, dg::DGSEM, cache)
-    volume_flux = volume_integral.volume_flux
+    RealT = eltype(du)
 
+    volume_flux = volume_integral.volume_flux
     derivative_split = dg.basis.derivative_split
     set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!`
 
-    derivative_split = CuArray{Float64}(derivative_split)
-    volume_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
+    derivative_split = CuArray(derivative_split)
+    volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
 
     volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr, u, equations,
                                                                 volume_flux)
@@ -652,14 +653,15 @@ end
 # Pack kernels for calculating volume integrals
 function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::True, equations,
                                volume_integral::VolumeIntegralFluxDifferencing, dg::DGSEM, cache)
-    symmetric_flux, nonconservative_flux = dg.volume_integral.volume_flux
+    RealT = eltype(du)
 
+    symmetric_flux, nonconservative_flux = dg.volume_integral.volume_flux
     derivative_split = dg.basis.derivative_split
     set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!`
 
-    derivative_split = CuArray{Float64}(derivative_split)
-    symmetric_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
-    noncons_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
+    derivative_split = CuArray(derivative_split)
+    symmetric_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
+    noncons_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
 
     symmetric_noncons_flux_kernel = @cuda launch=false symmetric_noncons_flux_kernel!(symmetric_flux_arr,
                                                                                       noncons_flux_arr,
@@ -673,7 +675,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
                                   kernel_configurator_2d(symmetric_noncons_flux_kernel,
                                                          size(u, 2)^2, size(u, 3))...)
 
-    derivative_split = CuArray{Float64}(dg.basis.derivative_split) # use original `derivative_split`
+    derivative_split = CuArray(dg.basis.derivative_split) # use original `derivative_split`
 
     volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split,
                                                                         symmetric_flux_arr,
@@ -687,6 +689,8 @@ end
 # Pack kernels for calculating volume integrals
 function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::False, equations,
                                volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
+    RealT = eltype(du)
+
     volume_flux_dg, volume_flux_fv = dg.volume_integral.volume_flux_dg,
                                      dg.volume_integral.volume_flux_fv
     indicator = dg.volume_integral.indicator
@@ -694,14 +698,13 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     # TODO: Get copies of `u` and `du` on both device and host
     # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
     alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
-    alpha = CuArray{Float64}(alpha)
+    alpha = CuArray(alpha)
 
-    # For `Float64`, this gives 1.8189894035458565e-12
-    # For `Float32`, this gives 1.1920929f-5
-    atol = 1.8189894035458565e-12 # see also `pure_and_blended_element_ids!` in Trixi.jl
+    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
 
     element_ids_dg = CUDA.zeros(Int, length(alpha))
     element_ids_dgfv = CUDA.zeros(Int, length(alpha))
 
+    # See `pure_and_blended_element_ids!` in Trixi.jl
     pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
                                                                                               element_ids_dgfv,
                                                                                               alpha,
@@ -713,10 +716,10 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     derivative_split = dg.basis.derivative_split
     set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!`
 
-    derivative_split = CuArray{Float64}(derivative_split)
-    volume_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
+    derivative_split = CuArray(derivative_split)
+    inverse_weights = CuArray(dg.basis.inverse_weights)
+    volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
 
-    inverse_weights = CuArray{Float64}(dg.basis.inverse_weights)
     fstar1_L = cache.fstar1_L
     fstar1_R = cache.fstar1_R
@@ -755,6 +758,8 @@ end
 # Pack kernels for calculating volume integrals
 function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::True, equations,
                                volume_integral::VolumeIntegralShockCapturingHG, dg::DGSEM, cache)
+    RealT = eltype(du)
+
     volume_flux_dg, nonconservative_flux_dg = dg.volume_integral.volume_flux_dg
     volume_flux_fv, nonconservative_flux_fv = dg.volume_integral.volume_flux_fv
     indicator = dg.volume_integral.indicator
@@ -762,14 +767,13 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     # TODO: Get copies of `u` and `du` on both device and host
     # FIXME: Scalar indexing on GPU arrays caused by using GPU cache
     alpha = indicator(Array(u), mesh, equations, dg, cache) # GPU cache
-    alpha = CuArray{Float64}(alpha)
+    alpha = CuArray(alpha)
 
-    # For `Float64`, this gives 1.8189894035458565e-12
-    # For `Float32`, this gives 1.1920929f-5
-    atol = 1.8189894035458565e-12 # see also `pure_and_blended_element_ids!` in Trixi.jl
+    atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))
 
     element_ids_dg = CUDA.zeros(Int, length(alpha))
     element_ids_dgfv = CUDA.zeros(Int, length(alpha))
 
+    # See `pure_and_blended_element_ids!` in Trixi.jl
     pure_blended_element_count_kernel = @cuda launch=false pure_blended_element_count_kernel!(element_ids_dg,
                                                                                               element_ids_dgfv,
                                                                                               alpha,
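
The hard-coded tolerance is replaced by the precision-dependent formula from `pure_and_blended_element_ids!` in Trixi.jl, which reproduces exactly the two values quoted in the removed comments:

```julia
atol(RealT) = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0))

@assert atol(Float64) == 1.8189894035458565e-12  # eps^0.75 dominates
@assert atol(Float32) == 1.1920929f-5            # 100 * eps dominates
```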
@@ -781,11 +785,11 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
     derivative_split = dg.basis.derivative_split
     set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!`
 
-    derivative_split = CuArray{Float64}(derivative_split)
-    volume_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
-    noncons_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
+    derivative_split = CuArray(derivative_split)
+    inverse_weights = CuArray(dg.basis.inverse_weights)
+    volume_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
+    noncons_flux_arr = CuArray{RealT}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3))
 
-    inverse_weights = CuArray{Float64}(dg.basis.inverse_weights)
     fstar1_L = cache.fstar1_L
     fstar1_R = cache.fstar1_R
@@ -804,7 +808,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
                                  kernel_configurator_2d(volume_flux_dgfv_kernel,
                                                         size(u, 2)^2, size(u, 3))...)
 
-    derivative_split = CuArray{Float64}(dg.basis.derivative_split) # use original `derivative_split`
+    derivative_split = CuArray(dg.basis.derivative_split) # use original `derivative_split`
 
     volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg,
                                                                               element_ids_dgfv,
@@ -847,12 +851,14 @@ end
 # Pack kernels for calculating interface fluxes
 function cuda_interface_flux!(mesh::TreeMesh{1}, nonconservative_terms::False, equations,
                               dg::DGSEM, cache)
+    RealT = real(dg)
+
     surface_flux = dg.surface_integral.surface_flux
 
     neighbor_ids = cache.interfaces.neighbor_ids
     interfaces_u = cache.interfaces.u
     surface_flux_values = cache.elements.surface_flux_values
 
-    surface_flux_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...)
+    surface_flux_arr = CuArray{RealT}(undef, size(interfaces_u)[2:end]...)
 
     surface_flux_kernel = @cuda launch=false surface_flux_kernel!(surface_flux_arr, interfaces_u,
                                                                   equations, surface_flux)
@@ -873,15 +879,17 @@ end
 # Pack kernels for calculating interface fluxes
 function cuda_interface_flux!(mesh::TreeMesh{1}, nonconservative_terms::True, equations,
                               dg::DGSEM, cache)
+    RealT = real(dg)
+
     surface_flux, nonconservative_flux = dg.surface_integral.surface_flux
 
     neighbor_ids = cache.interfaces.neighbor_ids
     interfaces_u = cache.interfaces.u
     surface_flux_values = cache.elements.surface_flux_values
 
-    surface_flux_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...)
-    noncons_left_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...)
-    noncons_right_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...)
+    surface_flux_arr = CuArray{RealT}(undef, size(interfaces_u)[2:end]...)
+    noncons_left_arr = CuArray{RealT}(undef, size(interfaces_u)[2:end]...)
+    noncons_right_arr = CuArray{RealT}(undef, size(interfaces_u)[2:end]...)
 
     surface_noncons_flux_kernel = @cuda launch=false surface_noncons_flux_kernel!(surface_flux_arr,
                                                                                   noncons_left_arr,
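
In the interface-flux wrappers there is no `du` argument to take `eltype` from, so the precision is queried from the solver instead: `real(dg)` returns the floating-point type of the DG basis. A quick illustration:

```julia
using Trixi

basis = LobattoLegendreBasis(Float32, 3)
solver = DGSEM(basis, flux_lax_friedrichs)
@assert real(solver) == Float32  # what `RealT = real(dg)` picks up
```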
@@ -1029,10 +1037,10 @@ end
 # Pack kernels for calculating surface integrals
 function cuda_surface_integral!(du, mesh::TreeMesh{1}, equations, dg::DGSEM, cache)
-    factor_arr = CuArray{Float64}([
-                                      dg.basis.boundary_interpolation[1, 1],
-                                      dg.basis.boundary_interpolation[size(du, 2), 2]
-                                  ])
+    factor_arr = CuArray([
+                             dg.basis.boundary_interpolation[1, 1],
+                             dg.basis.boundary_interpolation[size(du, 2), 2]
+                         ])
     surface_flux_values = cache.elements.surface_flux_values
 
     surface_integral_kernel = @cuda launch=false surface_integral_kernel!(du, factor_arr,
diff --git a/test.jl b/test.jl
new file mode 100644
index 0000000..a15d647
--- /dev/null
+++ b/test.jl
@@ -0,0 +1,94 @@
+using Trixi, TrixiCUDA
+
+RealT = Float32 # set precision
+
+equations = CompressibleEulerEquations1D(1.4f0)
+
+initial_condition = initial_condition_weak_blast_wave
+
+surface_flux = flux_lax_friedrichs
+volume_flux = flux_shima_etal
+basis = LobattoLegendreBasis(RealT, 3)
+indicator_sc = IndicatorHennemannGassner(equations, basis,
+                                         alpha_max = 0.5f0,
+                                         alpha_min = 0.001f0,
+                                         alpha_smooth = true,
+                                         variable = density_pressure)
+volume_integral = VolumeIntegralShockCapturingHG(indicator_sc;
+                                                 volume_flux_dg = volume_flux,
+                                                 volume_flux_fv = surface_flux)
+solver = DGSEM(basis, surface_flux, volume_integral)
+
+coordinates_min = -2.0f0
+coordinates_max = 2.0f0
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 5,
+                n_cells_max = 10_000, RealT = RealT)
+
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver)
+
+tspan = tspan_gpu = (0.0f0, 1.0f0)
+t = t_gpu = 0.0f0
+
+# Semi on CPU
+(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi
+
+# Semi on GPU
+equations_gpu = semi_gpu.equations
+mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache
+boundary_conditions_gpu = semi_gpu.boundary_conditions
+source_terms_gpu = semi_gpu.source_terms
+
+# ODE on CPU
+ode = semidiscretize(semi, tspan)
+u_ode = copy(ode.u0)
+du_ode = similar(u_ode)
+u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache)
+du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache)
+
+# ODE on GPU
+ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu)
+u_gpu = copy(ode_gpu.u0)
+du_gpu = similar(u_gpu)
+
+TrixiCUDA.reset_du!(du_gpu)
+Trixi.reset_du!(du, solver, cache)
+
+TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu,
+                                Trixi.have_nonconservative_terms(equations_gpu),
+                                equations_gpu, solver_gpu.volume_integral, solver_gpu,
+                                cache_gpu)
+Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations),
+                            equations, solver.volume_integral, solver, cache)
+
+TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu)
+Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver)
+
+TrixiCUDA.cuda_interface_flux!(mesh_gpu,
+                               Trixi.have_nonconservative_terms(equations_gpu),
+                               equations_gpu, solver_gpu, cache_gpu)
+Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh,
+                           Trixi.have_nonconservative_terms(equations), equations,
+                           solver.surface_integral, solver, cache)
+
+TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu,
+                                   equations_gpu, cache_gpu)
+Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver)
+
+TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu,
+                              Trixi.have_nonconservative_terms(equations_gpu),
+                              equations_gpu, solver_gpu, cache_gpu)
+Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations,
+                          solver.surface_integral, solver)
+
+TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu)
+Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral,
+                             solver, cache)
+
+TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu)
+Trixi.apply_jacobian!(du, mesh, equations, solver, cache)
+
+TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu,
+                        equations_gpu, cache_gpu)
+Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache)
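
The script drives each GPU kernel next to its CPU counterpart but stops short of comparing the two results. A natural final step, sketched here as a suggestion rather than part of the patch (assuming `du_gpu` and `du_ode` store the flattened solution in the same layout):

```julia
# Compare the GPU result against the CPU reference, with a RealT-scaled tolerance.
@assert isapprox(Array(du_gpu), du_ode; rtol = 100 * eps(RealT))
```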