From 471cb7dbb37bfa63e27c3a9013959b1cf87b29c1 Mon Sep 17 00:00:00 2001 From: Huiyu Xie Date: Thu, 3 Oct 2024 23:55:00 -1000 Subject: [PATCH] Optimize kernel configurators (#62) * Start * Complete 1D * Complete 2D * Complete 3D * Minor change --- src/auxiliary/configurators.jl | 84 +++++++++++--- src/solvers/dg_1d.jl | 109 ++++++++---------- src/solvers/dg_2d.jl | 181 +++++++++++++---------------- src/solvers/dg_3d.jl | 201 ++++++++++++++++----------------- test/runtests.jl | 2 +- test/test_macros.jl | 47 ++++---- test/test_script.jl | 107 ++++++------------ 7 files changed, 351 insertions(+), 380 deletions(-) diff --git a/src/auxiliary/configurators.jl b/src/auxiliary/configurators.jl index a668f1c..c2e04a7 100644 --- a/src/auxiliary/configurators.jl +++ b/src/auxiliary/configurators.jl @@ -1,32 +1,86 @@ # Kernel configurators are used for determining the number of threads and # blocks to be used in the kernel, which optimizes the use of GPU resources. -# Kernel configurator for 1D CUDA array -function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1}) - config = launch_configuration(kernel.fun) +# 1D kernel configurator +# We hardcode 32 threads per block for 1D kernels +function kernel_configurator_1d(kernel::HostKernel, x::Int) + # config = launch_configuration(kernel.fun) # not used in this case - threads = min(length(array), config.threads) - blocks = cld(length(array), threads) + threads = 32 # warp size is 32, if block size is less than 32, it will be padded to 32 + blocks = cld(x, threads[1]) return (threads = threads, blocks = blocks) end -# Kernel configurator for 2D CUDA array -function configurator_2d(kernel::HostKernel, array::CuArray{<:Any, 2}) - config = launch_configuration(kernel.fun) +# 2D kernel configurator +# We hardcode 32 threads for x dimension per block, and y dimension is determined +# by the number of threads returned by the launch configuration +function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int) + config = launch_configuration(kernel.fun) # get the number of threads - threads = Tuple(fill(Int(floor((min(maximum(size(array)), config.threads))^(1 / 2))), 2)) - blocks = map(cld, size(array), threads) + # y dimension + dims_y1 = cld(x * y, 32) + dims_y2 = max(fld(config.threads, 32), 1) + + dims_y = min(dims_y1, dims_y2) + + # x dimension is hardcoded to warp size 32 + threads = (32, dims_y) + blocks = (cld(x, threads[1]), cld(y, threads[2])) return (threads = threads, blocks = blocks) end -# Kernel configurator for 3D CUDA array -function configurator_3d(kernel::HostKernel, array::CuArray{<:Any, 3}) - config = launch_configuration(kernel.fun) +# 3D kernel configurator +# We hardcode 32 threads for x dimension per block, y and z dimensions are determined +# by the number of threads returned by the launch configuration +function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int) + config = launch_configuration(kernel.fun) # get the number of threads + + # y dimension + dims_y1 = cld(x * y, 32) + dims_y2 = max(fld(config.threads, 32), 1) + + dims_y = min(dims_y1, dims_y2) - threads = Tuple(fill(Int(floor((min(maximum(size(array)), config.threads))^(1 / 3))), 3)) - blocks = map(cld, size(array), threads) + # z dimension + dims_z1 = cld(x * y * z, 32 * dims_y) + dims_z2 = max(fld(config.threads, 32 * dims_y), 1) + + dims_z = min(dims_z1, dims_z2) + + # x dimension is hardcoded to warp size 32 + threads = (32, dims_y, dims_z) + blocks = (cld(x, threads[1]), cld(y, threads[2]), cld(z, threads[3])) return (threads = threads, blocks = blocks) end + +# Deprecated old kernel configurators below + +# function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1}) +# config = launch_configuration(kernel.fun) + +# threads = min(length(array), config.threads) +# blocks = cld(length(array), threads) + +# return (threads = threads, blocks = blocks) +# end + +# function configurator_2d(kernel::HostKernel, array::CuArray{<:Any, 2}) +# config = launch_configuration(kernel.fun) + +# threads = Tuple(fill(Int(floor((min(maximum(size(array)), config.threads))^(1 / 2))), 2)) +# blocks = map(cld, size(array), threads) + +# return (threads = threads, blocks = blocks) +# end + +# function configurator_3d(kernel::HostKernel, array::CuArray{<:Any, 3}) +# config = launch_configuration(kernel.fun) + +# threads = Tuple(fill(Int(floor((min(maximum(size(array)), config.threads))^(1 / 3))), 3)) +# blocks = map(cld, size(array), threads) + +# return (threads = threads, blocks = blocks) +# end diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl index cefff53..f9be04f 100644 --- a/src/solvers/dg_1d.jl +++ b/src/solvers/dg_1d.jl @@ -613,13 +613,13 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms, derivative_dhat = CuArray{Float64}(dg.basis.derivative_dhat) flux_arr = similar(u) - size_arr = CuArray{Float64}(undef, size(u, 2), size(u, 3)) - flux_kernel = @cuda launch=false flux_kernel!(flux_arr, u, equations, flux) - flux_kernel(flux_arr, u, equations, flux; configurator_2d(flux_kernel, size_arr)...) + flux_kernel(flux_arr, u, equations, flux; + kernel_configurator_2d(flux_kernel, size(u, 2), size(u, 3))...) weak_form_kernel = @cuda launch=false weak_form_kernel!(du, derivative_dhat, flux_arr) - weak_form_kernel(du, derivative_dhat, flux_arr; configurator_3d(weak_form_kernel, du)...) + weak_form_kernel(du, derivative_dhat, flux_arr; + kernel_configurator_3d(weak_form_kernel, size(du)...)...) return nothing end @@ -636,17 +636,15 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: derivative_split = CuArray{Float64}(derivative_split) volume_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3)) - size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 3)) - volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr, u, equations, volume_flux) volume_flux_kernel(volume_flux_arr, u, equations, volume_flux; - configurator_2d(volume_flux_kernel, size_arr)...) + kernel_configurator_2d(volume_flux_kernel, size(u, 2)^2, size(u, 3))...) volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, volume_flux_arr, equations) volume_integral_kernel(du, derivative_split, volume_flux_arr, equations; - configurator_3d(volume_integral_kernel, du)...) + kernel_configurator_3d(volume_integral_kernel, size(du)...)...) return nothing end @@ -663,8 +661,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: symmetric_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3)) noncons_flux_arr = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 3)) - size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 3)) - symmetric_noncons_flux_kernel = @cuda launch=false symmetric_noncons_flux_kernel!(symmetric_flux_arr, noncons_flux_arr, u, @@ -674,7 +670,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: nonconservative_flux) symmetric_noncons_flux_kernel(symmetric_flux_arr, noncons_flux_arr, u, derivative_split, equations, symmetric_flux, nonconservative_flux; - configurator_2d(symmetric_noncons_flux_kernel, size_arr)...) + kernel_configurator_2d(symmetric_noncons_flux_kernel, + size(u, 2)^2, size(u, 3))...) derivative_split = CuArray{Float64}(dg.basis.derivative_split) # use original `derivative_split` @@ -682,7 +679,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: symmetric_flux_arr, noncons_flux_arr) volume_integral_kernel(du, derivative_split, symmetric_flux_arr, noncons_flux_arr; - configurator_3d(volume_integral_kernel, du)...) + kernel_configurator_3d(volume_integral_kernel, size(du)...)...) return nothing end @@ -702,7 +699,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: # For `Float64`, this gives 1.8189894035458565e-12 # For `Float32`, this gives 1.1920929f-5 atol = 1.8189894035458565e-12 # see also `pure_and_blended_element_ids!` in Trixi.jl - element_ids_dg = CUDA.zeros(Int, length(alpha)) element_ids_dgfv = CUDA.zeros(Int, length(alpha)) @@ -711,7 +707,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: alpha, atol) pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol; - configurator_1d(pure_blended_element_count_kernel, alpha)...) + kernel_configurator_1d(pure_blended_element_count_kernel, + length(alpha))...) derivative_split = dg.basis.derivative_split set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` @@ -723,8 +720,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: fstar1_L = cache.fstar1_L fstar1_R = cache.fstar1_R - size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 3)) - volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u, element_ids_dgfv, @@ -732,7 +727,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: volume_flux_fv) volume_flux_dgfv_kernel(volume_flux_arr, fstar1_L, fstar1_R, u, element_ids_dgfv, equations, volume_flux_dg, volume_flux_fv; - configurator_2d(volume_flux_dgfv_kernel, size_arr)...) + kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^2, + size(u, 3))...) volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, @@ -742,9 +738,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: equations) volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, volume_flux_arr, equations; - configurator_3d(volume_integral_dg_kernel, du)...) - - size_arr = CuArray{Float64}(undef, size(u, 2), size(u, 3)) + kernel_configurator_3d(volume_integral_dg_kernel, size(du)...)...) volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, @@ -752,7 +746,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: element_ids_dgfv, alpha) volume_integral_fv_kernel(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv, alpha; - configurator_2d(volume_integral_fv_kernel, size_arr)...) + kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2), + size(u, 3))...) return nothing end @@ -772,7 +767,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: # For `Float64`, this gives 1.8189894035458565e-12 # For `Float32`, this gives 1.1920929f-5 atol = 1.8189894035458565e-12 # see also `pure_and_blended_element_ids!` in Trixi.jl - element_ids_dg = CUDA.zeros(Int, length(alpha)) element_ids_dgfv = CUDA.zeros(Int, length(alpha)) @@ -781,7 +775,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: alpha, atol) pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol; - configurator_1d(pure_blended_element_count_kernel, alpha)...) + kernel_configurator_1d(pure_blended_element_count_kernel, + length(alpha))...) derivative_split = dg.basis.derivative_split set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` @@ -794,8 +789,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: fstar1_L = cache.fstar1_L fstar1_R = cache.fstar1_R - size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 3)) - volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u, @@ -808,7 +801,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: volume_flux_dgfv_kernel(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, u, element_ids_dgfv, derivative_split, equations, volume_flux_dg, nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv; - configurator_2d(volume_flux_dgfv_kernel, size_arr)...) + kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^2, + size(u, 3))...) derivative_split = CuArray{Float64}(dg.basis.derivative_split) # use original `derivative_split` @@ -821,9 +815,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: equations) volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, volume_flux_arr, noncons_flux_arr, equations; - configurator_3d(volume_integral_dg_kernel, du)...) - - size_arr = CuArray{Float64}(undef, size(u, 2), size(u, 3)) + kernel_configurator_3d(volume_integral_dg_kernel, size(du)...)...) volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, @@ -831,7 +823,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms:: element_ids_dgfv, alpha) volume_integral_fv_kernel(du, fstar1_L, fstar1_R, inverse_weights, element_ids_dgfv, alpha; - configurator_2d(volume_integral_fv_kernel, size_arr)...) + kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2), + size(u, 3))...) return nothing end @@ -841,12 +834,12 @@ function cuda_prolong2interfaces!(u, mesh::TreeMesh{1}, equations, cache) neighbor_ids = cache.interfaces.neighbor_ids interfaces_u = cache.interfaces.u - size_arr = CuArray{Float64}(undef, size(interfaces_u, 2), size(interfaces_u, 3)) - prolong_interfaces_kernel = @cuda launch=false prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids) prolong_interfaces_kernel(interfaces_u, u, neighbor_ids; - configurator_2d(prolong_interfaces_kernel, size_arr)...) + kernel_configurator_2d(prolong_interfaces_kernel, + size(interfaces_u, 2), + size(interfaces_u, 3))...) return nothing end @@ -859,22 +852,20 @@ function cuda_interface_flux!(mesh::TreeMesh{1}, nonconservative_terms::False, e neighbor_ids = cache.interfaces.neighbor_ids interfaces_u = cache.interfaces.u surface_flux_values = cache.elements.surface_flux_values - surface_flux_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) - size_arr = CuArray{Float64}(undef, size(interfaces_u, 3)) surface_flux_kernel = @cuda launch=false surface_flux_kernel!(surface_flux_arr, interfaces_u, equations, surface_flux) surface_flux_kernel(surface_flux_arr, interfaces_u, equations, surface_flux; - configurator_1d(surface_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3)) + kernel_configurator_1d(surface_flux_kernel, size(interfaces_u, 3))...) interface_flux_kernel = @cuda launch=false interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids) interface_flux_kernel(surface_flux_values, surface_flux_arr, neighbor_ids; - configurator_2d(interface_flux_kernel, size_arr)...) + kernel_configurator_2d(interface_flux_kernel, + size(surface_flux_values, 1), + size(interfaces_u, 3))...) return nothing end @@ -892,8 +883,6 @@ function cuda_interface_flux!(mesh::TreeMesh{1}, nonconservative_terms::True, eq noncons_left_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) noncons_right_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) - size_arr = CuArray{Float64}(undef, size(interfaces_u, 3)) - surface_noncons_flux_kernel = @cuda launch=false surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, @@ -903,9 +892,8 @@ function cuda_interface_flux!(mesh::TreeMesh{1}, nonconservative_terms::True, eq nonconservative_flux) surface_noncons_flux_kernel(surface_flux_arr, noncons_left_arr, noncons_right_arr, interfaces_u, equations, surface_flux, nonconservative_flux; - configurator_1d(surface_noncons_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3)) + kernel_configurator_1d(surface_noncons_flux_kernel, + size(interfaces_u, 3))...) interface_flux_kernel = @cuda launch=false interface_flux_kernel!(surface_flux_values, surface_flux_arr, @@ -914,7 +902,9 @@ function cuda_interface_flux!(mesh::TreeMesh{1}, nonconservative_terms::True, eq neighbor_ids) interface_flux_kernel(surface_flux_values, surface_flux_arr, noncons_left_arr, noncons_right_arr, neighbor_ids; - configurator_2d(interface_flux_kernel, size_arr)...) + kernel_configurator_2d(interface_flux_kernel, + size(surface_flux_values, 1), + size(interfaces_u, 3))...) return nothing end @@ -932,13 +922,13 @@ function cuda_prolong2boundaries!(u, mesh::TreeMesh{1}, boundary_conditions::Nam neighbor_sides = cache.boundaries.neighbor_sides boundaries_u = cache.boundaries.u - size_arr = CuArray{Float64}(undef, size(boundaries_u, 2), size(boundaries_u, 3)) - prolong_boundaries_kernel = @cuda launch=false prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides) prolong_boundaries_kernel(boundaries_u, u, neighbor_ids, neighbor_sides; - configurator_2d(prolong_boundaries_kernel, size_arr)...) + kernel_configurator_2d(prolong_boundaries_kernel, + size(boundaries_u, 2), + size(boundaries_u, 3))...) return nothing end @@ -970,11 +960,10 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{1}, boundary_conditions::NamedTup last_first_indices_kernel = @cuda launch=false last_first_indices_kernel!(lasts, firsts, n_boundaries_per_direction) last_first_indices_kernel(lasts, firsts, n_boundaries_per_direction; - configurator_1d(last_first_indices_kernel, lasts)...) + kernel_configurator_1d(last_first_indices_kernel, length(lasts))...) - indices_arr = firsts boundary_arr = CuArray{Int}(Array(firsts)[1]:Array(lasts)[end]) - + indices_arr = firsts boundary_conditions_callable = replace_boundary_conditions(boundary_conditions) boundary_flux_kernel = @cuda launch=false boundary_flux_kernel!(surface_flux_values, @@ -988,7 +977,7 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{1}, boundary_conditions::NamedTup boundary_flux_kernel(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, indices_arr, neighbor_ids, neighbor_sides, orientations, boundary_conditions_callable, equations, surface_flux; - configurator_1d(boundary_flux_kernel, boundary_arr)...) + kernel_configurator_1d(boundary_flux_kernel, length(boundary_arr))...) return nothing end @@ -1014,11 +1003,10 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{1}, boundary_conditions::NamedTup last_first_indices_kernel = @cuda launch=false last_first_indices_kernel!(lasts, firsts, n_boundaries_per_direction) last_first_indices_kernel(lasts, firsts, n_boundaries_per_direction; - configurator_1d(last_first_indices_kernel, lasts)...) + kernel_configurator_1d(last_first_indices_kernel, length(lasts))...) - indices_arr = firsts boundary_arr = CuArray{Int}(Array(firsts)[1]:Array(lasts)[end]) - + indices_arr = firsts boundary_conditions_callable = replace_boundary_conditions(boundary_conditions) boundary_flux_kernel = @cuda launch=false boundary_flux_kernel!(surface_flux_values, @@ -1034,7 +1022,7 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{1}, boundary_conditions::NamedTup indices_arr, neighbor_ids, neighbor_sides, orientations, boundary_conditions_callable, equations, surface_flux, nonconservative_flux; - configurator_1d(boundary_flux_kernel, boundary_arr)...) + kernel_configurator_1d(boundary_flux_kernel, length(boundary_arr))...) return nothing end @@ -1051,7 +1039,7 @@ function cuda_surface_integral!(du, mesh::TreeMesh{1}, equations, dg::DGSEM, cac surface_flux_values, equations) surface_integral_kernel(du, factor_arr, surface_flux_values, equations; - configurator_3d(surface_integral_kernel, du)...) + kernel_configurator_3d(surface_integral_kernel, size(du)...)...) return nothing end @@ -1061,7 +1049,8 @@ function cuda_jacobian!(du, mesh::TreeMesh{1}, equations, cache) inverse_jacobian = cache.elements.inverse_jacobian jacobian_kernel = @cuda launch=false jacobian_kernel!(du, inverse_jacobian, equations) - jacobian_kernel(du, inverse_jacobian, equations; configurator_3d(jacobian_kernel, du)...) + jacobian_kernel(du, inverse_jacobian, equations; + kernel_configurator_3d(jacobian_kernel, size(du)...)...) return nothing end @@ -1075,12 +1064,10 @@ end function cuda_sources!(du, u, t, source_terms, equations::AbstractEquations{1}, cache) node_coordinates = cache.elements.node_coordinates - size_arr = CuArray{Float64}(undef, size(du, 2), size(du, 3)) - source_terms_kernel = @cuda launch=false source_terms_kernel!(du, u, node_coordinates, t, equations, source_terms) source_terms_kernel(du, u, node_coordinates, t, equations, source_terms; - configurator_2d(source_terms_kernel, size_arr)...) + kernel_configurator_2d(source_terms_kernel, size(du, 2), size(du, 3))...) return nothing end diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index 8444764..c01c251 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -1013,17 +1013,15 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms, flux_arr1 = similar(u) flux_arr2 = similar(u) - size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 4)) - flux_kernel = @cuda launch=false flux_kernel!(flux_arr1, flux_arr2, u, equations, flux) - flux_kernel(flux_arr1, flux_arr2, u, equations, flux; configurator_2d(flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) + flux_kernel(flux_arr1, flux_arr2, u, equations, flux; + kernel_configurator_2d(flux_kernel, size(u, 2)^2, size(u, 4))...) weak_form_kernel = @cuda launch=false weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2) weak_form_kernel(du, derivative_dhat, flux_arr1, flux_arr2; - configurator_3d(weak_form_kernel, size_arr)...) + kernel_configurator_3d(weak_form_kernel, size(du, 1), size(du, 2)^2, + size(du, 4))...) return nothing end @@ -1042,20 +1040,17 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: volume_flux_arr2 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), size(u, 4)) - size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 4)) - volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, u, equations, volume_flux) volume_flux_kernel(volume_flux_arr1, volume_flux_arr2, u, equations, volume_flux; - configurator_2d(volume_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) + kernel_configurator_2d(volume_flux_kernel, size(u, 2)^3, size(u, 4))...) volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, equations) volume_integral_kernel(du, derivative_split, volume_flux_arr1, volume_flux_arr2, equations; - configurator_3d(volume_integral_kernel, size_arr)...) + kernel_configurator_3d(volume_integral_kernel, size(du, 1), + size(du, 2)^2, size(du, 4))...) return nothing end @@ -1078,8 +1073,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: noncons_flux_arr2 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), size(u, 4)) - size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 4)) - symmetric_noncons_flux_kernel = @cuda launch=false symmetric_noncons_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, noncons_flux_arr1, @@ -1092,10 +1085,10 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: symmetric_noncons_flux_kernel(symmetric_flux_arr1, symmetric_flux_arr2, noncons_flux_arr1, noncons_flux_arr2, u, derivative_split, equations, symmetric_flux, nonconservative_flux; - configurator_2d(symmetric_noncons_flux_kernel, size_arr)...) + kernel_configurator_2d(symmetric_noncons_flux_kernel, + size(u, 2)^3, size(u, 4))...) derivative_split = CuArray{Float64}(dg.basis.derivative_split) # use original `derivative_split` - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, @@ -1104,7 +1097,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: noncons_flux_arr2) volume_integral_kernel(du, derivative_split, symmetric_flux_arr1, symmetric_flux_arr2, noncons_flux_arr1, noncons_flux_arr2; - configurator_3d(volume_integral_kernel, size_arr)...) + kernel_configurator_3d(volume_integral_kernel, size(du, 1), + size(du, 2)^2, size(du, 4))...) return nothing end @@ -1124,7 +1118,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: # For `Float64`, this gives 1.8189894035458565e-12 # For `Float32`, this gives 1.1920929f-5 atol = 1.8189894035458565e-12 # see also `pure_and_blended_element_ids!` in Trixi.jl - element_ids_dg = CUDA.zeros(Int, length(alpha)) element_ids_dgfv = CUDA.zeros(Int, length(alpha)) @@ -1133,7 +1126,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: alpha, atol) pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol; - configurator_1d(pure_blended_element_count_kernel, alpha)...) + kernel_configurator_1d(pure_blended_element_count_kernel, + length(alpha))...) derivative_split = dg.basis.derivative_split set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` @@ -1150,8 +1144,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: fstar2_L = cache.fstar2_L fstar2_R = cache.fstar2_R - size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 4)) - volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, fstar1_L, fstar1_R, @@ -1163,9 +1155,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: volume_flux_dgfv_kernel(volume_flux_arr1, volume_flux_arr2, fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, element_ids_dgfv, equations, volume_flux_dg, volume_flux_fv; - configurator_2d(volume_flux_dgfv_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) + kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^3, + size(u, 4))...) volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, @@ -1176,9 +1167,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: equations) volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, volume_flux_arr1, volume_flux_arr2, equations; - configurator_3d(volume_integral_dg_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 4)) + kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1), + size(du, 2)^2, size(du, 4))...) volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, @@ -1188,7 +1178,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: alpha) volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, inverse_weights, element_ids_dgfv, alpha; - configurator_2d(volume_integral_fv_kernel, size_arr)...) + kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^2, + size(u, 4))...) return nothing end @@ -1208,7 +1199,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: # For `Float64`, this gives 1.8189894035458565e-12 # For `Float32`, this gives 1.1920929f-5 atol = 1.8189894035458565e-12 # see also `pure_and_blended_element_ids!` in Trixi.jl - element_ids_dg = CUDA.zeros(Int, length(alpha)) element_ids_dgfv = CUDA.zeros(Int, length(alpha)) @@ -1217,7 +1207,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: alpha, atol) pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol; - configurator_1d(pure_blended_element_count_kernel, alpha)...) + kernel_configurator_1d(pure_blended_element_count_kernel, + length(alpha))...) derivative_split = dg.basis.derivative_split set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` @@ -1238,8 +1229,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: fstar2_L = cache.fstar2_L fstar2_R = cache.fstar2_R - size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 4)) - volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_flux_arr1, @@ -1257,12 +1246,11 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: noncons_flux_arr2, fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, element_ids_dgfv, derivative_split, equations, volume_flux_dg, nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv; - configurator_2d(volume_flux_dgfv_kernel, size_arr)...) + kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^3, + size(u, 4))...) derivative_split = CuArray{Float64}(dg.basis.derivative_split) # use original `derivative_split` - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) - volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, @@ -1275,9 +1263,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, volume_flux_arr1, volume_flux_arr2, noncons_flux_arr1, noncons_flux_arr2, equations; - configurator_3d(volume_integral_dg_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 4)) + kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1), + size(du, 2)^2, size(du, 4))...) volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, @@ -1287,7 +1274,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms:: alpha) volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, inverse_weights, element_ids_dgfv, alpha; - configurator_2d(volume_integral_fv_kernel, size_arr)...) + kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^2, + size(u, 4))...) return nothing end @@ -1298,15 +1286,14 @@ function cuda_prolong2interfaces!(u, mesh::TreeMesh{2}, equations, cache) orientations = cache.interfaces.orientations interfaces_u = cache.interfaces.u - size_arr = CuArray{Float64}(undef, size(interfaces_u, 2) * size(interfaces_u, 3), - size(interfaces_u, 4)) - prolong_interfaces_kernel = @cuda launch=false prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations, equations) prolong_interfaces_kernel(interfaces_u, u, neighbor_ids, orientations, equations; - configurator_2d(prolong_interfaces_kernel, size_arr)...) + kernel_configurator_2d(prolong_interfaces_kernel, + size(interfaces_u, 2) * size(interfaces_u, 3), + size(interfaces_u, 4))...) return nothing end @@ -1320,18 +1307,14 @@ function cuda_interface_flux!(mesh::TreeMesh{2}, nonconservative_terms::False, e orientations = cache.interfaces.orientations interfaces_u = cache.interfaces.u surface_flux_values = cache.elements.surface_flux_values - surface_flux_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) - size_arr = CuArray{Float64}(undef, size(interfaces_u, 3), size(interfaces_u, 4)) surface_flux_kernel = @cuda launch=false surface_flux_kernel!(surface_flux_arr, interfaces_u, orientations, equations, surface_flux) surface_flux_kernel(surface_flux_arr, interfaces_u, orientations, equations, surface_flux; - configurator_2d(surface_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3), - size(interfaces_u, 4)) + kernel_configurator_2d(surface_flux_kernel, size(interfaces_u, 3), + size(interfaces_u, 4))...) interface_flux_kernel = @cuda launch=false interface_flux_kernel!(surface_flux_values, surface_flux_arr, @@ -1339,7 +1322,10 @@ function cuda_interface_flux!(mesh::TreeMesh{2}, nonconservative_terms::False, e equations) interface_flux_kernel(surface_flux_values, surface_flux_arr, neighbor_ids, orientations, equations; - configurator_3d(interface_flux_kernel, size_arr)...) + kernel_configurator_3d(interface_flux_kernel, + size(surface_flux_values, 1), + size(interfaces_u, 3), + size(interfaces_u, 4))...) return nothing end @@ -1358,8 +1344,6 @@ function cuda_interface_flux!(mesh::TreeMesh{2}, nonconservative_terms::True, eq noncons_left_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) noncons_right_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) - size_arr = CuArray{Float64}(undef, size(interfaces_u, 3), size(interfaces_u, 4)) - surface_noncons_flux_kernel = @cuda launch=false surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, @@ -1370,10 +1354,9 @@ function cuda_interface_flux!(mesh::TreeMesh{2}, nonconservative_terms::True, eq nonconservative_flux) surface_noncons_flux_kernel(surface_flux_arr, noncons_left_arr, noncons_right_arr, interfaces_u, orientations, equations, surface_flux, nonconservative_flux; - configurator_2d(surface_noncons_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3), - size(interfaces_u, 4)) + kernel_configurator_2d(surface_noncons_flux_kernel, + size(interfaces_u, 3), + size(interfaces_u, 4))...) interface_flux_kernel = @cuda launch=false interface_flux_kernel!(surface_flux_values, surface_flux_arr, @@ -1383,7 +1366,10 @@ function cuda_interface_flux!(mesh::TreeMesh{2}, nonconservative_terms::True, eq equations) interface_flux_kernel(surface_flux_values, surface_flux_arr, noncons_left_arr, noncons_right_arr, neighbor_ids, orientations, equations; - configurator_3d(interface_flux_kernel, size_arr)...) + kernel_configurator_3d(interface_flux_kernel, + size(surface_flux_values, 1), + size(interfaces_u, 3), + size(interfaces_u, 4))...) return nothing end @@ -1402,9 +1388,6 @@ function cuda_prolong2boundaries!(u, mesh::TreeMesh{2}, boundary_conditions::Nam orientations = cache.boundaries.orientations boundaries_u = cache.boundaries.u - size_arr = CuArray{Float64}(undef, size(boundaries_u, 2) * size(boundaries_u, 3), - size(boundaries_u, 4)) - prolong_boundaries_kernel = @cuda launch=false prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides, @@ -1412,7 +1395,9 @@ function cuda_prolong2boundaries!(u, mesh::TreeMesh{2}, boundary_conditions::Nam equations) prolong_boundaries_kernel(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, equations; - configurator_2d(prolong_boundaries_kernel, size_arr)...) + kernel_configurator_2d(prolong_boundaries_kernel, + size(boundaries_u, 2) * size(boundaries_u, 3), + size(boundaries_u, 4))...) return nothing end @@ -1444,13 +1429,11 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{2}, boundary_conditions::NamedTup last_first_indices_kernel = @cuda launch=false last_first_indices_kernel!(lasts, firsts, n_boundaries_per_direction) last_first_indices_kernel(lasts, firsts, n_boundaries_per_direction; - configurator_1d(last_first_indices_kernel, lasts)...) + kernel_configurator_1d(last_first_indices_kernel, length(lasts))...) - indices_arr = firsts boundary_arr = CuArray{Int}(Array(firsts)[1]:Array(lasts)[end]) - + indices_arr = firsts boundary_conditions_callable = replace_boundary_conditions(boundary_conditions) - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 2), length(boundary_arr)) boundary_flux_kernel = @cuda launch=false boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, @@ -1463,7 +1446,8 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{2}, boundary_conditions::NamedTup boundary_flux_kernel(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, indices_arr, neighbor_ids, neighbor_sides, orientations, boundary_conditions_callable, equations, surface_flux; - configurator_2d(boundary_flux_kernel, size_arr)...) + kernel_configurator_2d(boundary_flux_kernel, size(surface_flux_values, 2), + length(boundary_arr))...) return nothing end @@ -1489,13 +1473,11 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{2}, boundary_conditions::NamedTup last_first_indices_kernel = @cuda launch=false last_first_indices_kernel!(lasts, firsts, n_boundaries_per_direction) last_first_indices_kernel(lasts, firsts, n_boundaries_per_direction; - configurator_1d(last_first_indices_kernel, lasts)...) + kernel_configurator_1d(last_first_indices_kernel, length(lasts))...) - indices_arr = firsts boundary_arr = CuArray{Int}(Array(firsts)[1]:Array(lasts)[end]) - + indices_arr = firsts boundary_conditions_callable = replace_boundary_conditions(boundary_conditions) - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 2), length(boundary_arr)) boundary_flux_kernel = @cuda launch=false boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, @@ -1510,7 +1492,8 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{2}, boundary_conditions::NamedTup indices_arr, neighbor_ids, neighbor_sides, orientations, boundary_conditions_callable, equations, surface_flux, nonconservative_flux; - configurator_2d(boundary_flux_kernel, size_arr)...) + kernel_configurator_2d(boundary_flux_kernel, size(surface_flux_values, 2), + length(boundary_arr))...) return nothing end @@ -1529,12 +1512,9 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{2}, cache_mortars::True, dg::DG # The original CPU arrays hold NaNs u_upper = cache.mortars.u_upper u_lower = cache.mortars.u_lower - forward_upper = CuArray{Float64}(dg.mortar.forward_upper) forward_lower = CuArray{Float64}(dg.mortar.forward_lower) - size_arr = CuArray{Float64}(undef, size(u_upper, 2), size(u_upper, 3), size(u_upper, 4)) - prolong_mortars_small2small_kernel = @cuda launch=false prolong_mortars_small2small_kernel!(u_upper, u_lower, u, @@ -1542,8 +1522,9 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{2}, cache_mortars::True, dg::DG large_sides, orientations) prolong_mortars_small2small_kernel(u_upper, u_lower, u, neighbor_ids, large_sides, orientations; - configurator_3d(prolong_mortars_small2small_kernel, - size_arr)...) + kernel_configurator_3d(prolong_mortars_small2small_kernel, + size(u_upper, 2), size(u_upper, 3), + size(u_upper, 4))...) prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper, u_lower, @@ -1555,8 +1536,9 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{2}, cache_mortars::True, dg::DG orientations) prolong_mortars_large2small_kernel(u_upper, u_lower, u, forward_upper, forward_lower, neighbor_ids, large_sides, orientations; - configurator_3d(prolong_mortars_large2small_kernel, - size_arr)...) + kernel_configurator_3d(prolong_mortars_large2small_kernel, + size(u_upper, 2), size(u_upper, 3), + size(u_upper, 4))...) return nothing end @@ -1584,21 +1566,16 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati surface_flux_values = cache.elements.surface_flux_values tmp_surface_flux_values = zero(similar(surface_flux_values)) - fstar_upper = cache.fstar_upper fstar_lower = cache.fstar_lower - size_arr = CuArray{Float64}(undef, size(u_upper, 3), length(orientations)) - mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_upper, fstar_lower, u_upper, u_lower, orientations, equations, surface_flux) mortar_flux_kernel(fstar_upper, fstar_lower, u_upper, u_lower, orientations, equations, surface_flux; - configurator_2d(mortar_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(surface_flux_values, 2), - length(orientations)) + kernel_configurator_2d(mortar_flux_kernel, size(u_upper, 3), + length(orientations))...) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, @@ -1612,7 +1589,10 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_upper, fstar_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; - configurator_3d(mortar_flux_copy_to_kernel, size_arr)...) + kernel_configurator_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2), + length(orientations))...) return nothing end @@ -1634,22 +1614,17 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati surface_flux_values = cache.elements.surface_flux_values tmp_surface_flux_values = zero(similar(surface_flux_values)) - fstar_upper = cache.fstar_upper fstar_lower = cache.fstar_lower - size_arr = CuArray{Float64}(undef, size(u_upper, 3), length(orientations)) - mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_upper, fstar_lower, u_upper, u_lower, orientations, large_sides, equations, surface_flux, nonconservative_flux) mortar_flux_kernel(fstar_upper, fstar_lower, u_upper, u_lower, orientations, large_sides, equations, surface_flux, nonconservative_flux; - configurator_2d(mortar_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(surface_flux_values, 2), - length(orientations)) + kernel_configurator_2d(mortar_flux_kernel, size(u_upper, 3), + length(orientations))...) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, @@ -1663,7 +1638,10 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_upper, fstar_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; - configurator_3d(mortar_flux_copy_to_kernel, size_arr)...) + kernel_configurator_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2), + length(orientations))...) return nothing end @@ -1676,13 +1654,12 @@ function cuda_surface_integral!(du, mesh::TreeMesh{2}, equations, dg::DGSEM, cac ]) surface_flux_values = cache.elements.surface_flux_values - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) - surface_integral_kernel = @cuda launch=false surface_integral_kernel!(du, factor_arr, surface_flux_values, equations) surface_integral_kernel(du, factor_arr, surface_flux_values, equations; - configurator_3d(surface_integral_kernel, size_arr)...) + kernel_configurator_3d(surface_integral_kernel, size(du, 1), + size(du, 2)^2, size(du, 4))...) return nothing end @@ -1691,10 +1668,10 @@ end function cuda_jacobian!(du, mesh::TreeMesh{2}, equations, cache) inverse_jacobian = cache.elements.inverse_jacobian - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^2, size(du, 4)) - jacobian_kernel = @cuda launch=false jacobian_kernel!(du, inverse_jacobian, equations) - jacobian_kernel(du, inverse_jacobian, equations; configurator_3d(jacobian_kernel, size_arr)...) + jacobian_kernel(du, inverse_jacobian, equations; + kernel_configurator_3d(jacobian_kernel, size(du, 1), size(du, 2)^2, + size(du, 4))...) return nothing end @@ -1708,12 +1685,10 @@ end function cuda_sources!(du, u, t, source_terms, equations::AbstractEquations{2}, cache) node_coordinates = cache.elements.node_coordinates - size_arr = CuArray{Float64}(undef, size(u, 2)^2, size(u, 4)) - source_terms_kernel = @cuda launch=false source_terms_kernel!(du, u, node_coordinates, t, equations, source_terms) source_terms_kernel(du, u, node_coordinates, t, equations, source_terms; - configurator_2d(source_terms_kernel, size_arr)...) + kernel_configurator_2d(source_terms_kernel, size(u, 2)^2, size(u, 4))...) return nothing end diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 869e378..7fbf3b1 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -1386,19 +1386,16 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms, flux_arr2 = similar(u) flux_arr3 = similar(u) - size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 5)) - flux_kernel = @cuda launch=false flux_kernel!(flux_arr1, flux_arr2, flux_arr3, u, equations, flux) flux_kernel(flux_arr1, flux_arr2, flux_arr3, u, equations, flux; - configurator_2d(flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) + kernel_configurator_2d(flux_kernel, size(u, 2)^3, size(u, 5))...) weak_form_kernel = @cuda launch=false weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3) weak_form_kernel(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3; - configurator_3d(weak_form_kernel, size_arr)...) + kernel_configurator_3d(weak_form_kernel, size(du, 1), size(du, 2)^3, + size(du, 5))...) return nothing end @@ -1419,16 +1416,12 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_flux_arr3 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), size(u, 2), size(u, 5)) - size_arr = CuArray{Float64}(undef, size(u, 2)^4, size(u, 5)) - volume_flux_kernel = @cuda launch=false volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, u, equations, volume_flux) volume_flux_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, u, equations, volume_flux; - configurator_2d(volume_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) + kernel_configurator_2d(volume_flux_kernel, size(u, 2)^4, size(u, 5))...) volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, volume_flux_arr1, @@ -1436,7 +1429,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_flux_arr3, equations) volume_integral_kernel(du, derivative_split, volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, equations; - configurator_3d(volume_integral_kernel, size_arr)...) + kernel_configurator_3d(volume_integral_kernel, size(du, 1), + size(du, 2)^3, size(du, 5))...) return nothing end @@ -1463,8 +1457,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: noncons_flux_arr3 = CuArray{Float64}(undef, size(u, 1), size(u, 2), size(u, 2), size(u, 2), size(u, 2), size(u, 5)) - size_arr = CuArray{Float64}(undef, size(u, 2)^4, size(u, 5)) - symmetric_noncons_flux_kernel = @cuda launch=false symmetric_noncons_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, symmetric_flux_arr3, @@ -1479,10 +1471,10 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: symmetric_noncons_flux_kernel(symmetric_flux_arr1, symmetric_flux_arr2, symmetric_flux_arr3, noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, u, derivative_split, equations, symmetric_flux, nonconservative_flux; - configurator_2d(symmetric_noncons_flux_kernel, size_arr)...) + kernel_configurator_2d(symmetric_noncons_flux_kernel, + size(u, 2)^4, size(u, 5))...) derivative_split = CuArray{Float64}(dg.basis.derivative_split) # use original `derivative_split` - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) volume_integral_kernel = @cuda launch=false volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, @@ -1494,7 +1486,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_integral_kernel(du, derivative_split, symmetric_flux_arr1, symmetric_flux_arr2, symmetric_flux_arr3, noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3; - configurator_3d(volume_integral_kernel, size_arr)...) + kernel_configurator_3d(volume_integral_kernel, size(du, 1), + size(du, 2)^3, size(du, 5))...) return nothing end @@ -1514,7 +1507,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: # For `Float64`, this gives 1.8189894035458565e-12 # For `Float32`, this gives 1.1920929f-5 atol = 1.8189894035458565e-12 # see also `pure_and_blended_element_ids!` in Trixi.jl - element_ids_dg = CUDA.zeros(Int, length(alpha)) element_ids_dgfv = CUDA.zeros(Int, length(alpha)) @@ -1523,7 +1515,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: alpha, atol) pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol; - configurator_1d(pure_blended_element_count_kernel, alpha)...) + kernel_configurator_1d(pure_blended_element_count_kernel, + length(alpha))...) derivative_split = dg.basis.derivative_split set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` @@ -1544,8 +1537,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: fstar3_L = cache.fstar3_L fstar3_R = cache.fstar3_R - size_arr = CuArray{Float64}(undef, size(u, 2)^4, size(u, 5)) - volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, @@ -1560,9 +1551,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_flux_dgfv_kernel(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, element_ids_dgfv, equations, volume_flux_dg, volume_flux_fv; - configurator_2d(volume_flux_dgfv_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) + kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^4, + size(u, 5))...) volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, @@ -1574,9 +1564,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: equations) volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, equations; - configurator_3d(volume_integral_dg_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 5)) + kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1), + size(du, 2)^3, size(du, 5))...) volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, @@ -1587,7 +1576,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: alpha) volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, inverse_weights, element_ids_dgfv, alpha; - configurator_2d(volume_integral_fv_kernel, size_arr)...) + kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^3, + size(u, 5))...) return nothing end @@ -1607,7 +1597,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: # For `Float64`, this gives 1.8189894035458565e-12 # For `Float32`, this gives 1.1920929f-5 atol = 1.8189894035458565e-12 # see also `pure_and_blended_element_ids!` in Trixi.jl - element_ids_dg = CUDA.zeros(Int, length(alpha)) element_ids_dgfv = CUDA.zeros(Int, length(alpha)) @@ -1616,7 +1605,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: alpha, atol) pure_blended_element_count_kernel(element_ids_dg, element_ids_dgfv, alpha, atol; - configurator_1d(pure_blended_element_count_kernel, alpha)...) + kernel_configurator_1d(pure_blended_element_count_kernel, + length(alpha))...) derivative_split = dg.basis.derivative_split set_diagonal_to_zero!(derivative_split) # temporarily set here, maybe move outside `rhs!` @@ -1643,8 +1633,6 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: fstar3_L = cache.fstar3_L fstar3_R = cache.fstar3_R - size_arr = CuArray{Float64}(undef, size(u, 2)^4, size(u, 5)) - volume_flux_dgfv_kernel = @cuda launch=false volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, @@ -1666,12 +1654,11 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, element_ids_dgfv, derivative_split, equations, volume_flux_dg, nonconservative_flux_dg, volume_flux_fv, nonconservative_flux_fv; - configurator_2d(volume_flux_dgfv_kernel, size_arr)...) + kernel_configurator_2d(volume_flux_dgfv_kernel, size(u, 2)^4, + size(u, 5))...) derivative_split = CuArray{Float64}(dg.basis.derivative_split) # use original `derivative_split` - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) - volume_integral_dg_kernel = @cuda launch=false volume_integral_dg_kernel!(du, element_ids_dg, element_ids_dgfv, alpha, @@ -1686,9 +1673,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: volume_integral_dg_kernel(du, element_ids_dg, element_ids_dgfv, alpha, derivative_split, volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, equations; - configurator_3d(volume_integral_dg_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 5)) + kernel_configurator_3d(volume_integral_dg_kernel, size(du, 1), + size(du, 2)^3, size(du, 5))...) volume_integral_fv_kernel = @cuda launch=false volume_integral_fv_kernel!(du, fstar1_L, fstar1_R, @@ -1699,7 +1685,8 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms:: alpha) volume_integral_fv_kernel(du, fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, inverse_weights, element_ids_dgfv, alpha; - configurator_2d(volume_integral_fv_kernel, size_arr)...) + kernel_configurator_2d(volume_integral_fv_kernel, size(u, 2)^3, + size(u, 5))...) return nothing end @@ -1710,15 +1697,15 @@ function cuda_prolong2interfaces!(u, mesh::TreeMesh{3}, equations, cache) orientations = cache.interfaces.orientations interfaces_u = cache.interfaces.u - size_arr = CuArray{Float64}(undef, size(interfaces_u, 2) * size(interfaces_u, 3)^2, - size(interfaces_u, 5)) - prolong_interfaces_kernel = @cuda launch=false prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations, equations) prolong_interfaces_kernel(interfaces_u, u, neighbor_ids, orientations, equations; - configurator_2d(prolong_interfaces_kernel, size_arr)...) + kernel_configurator_2d(prolong_interfaces_kernel, + size(interfaces_u, 2) * + size(interfaces_u, 3)^2, + size(interfaces_u, 5))...) return nothing end @@ -1732,19 +1719,15 @@ function cuda_interface_flux!(mesh::TreeMesh{3}, nonconservative_terms::False, e orientations = cache.interfaces.orientations interfaces_u = cache.interfaces.u surface_flux_values = cache.elements.surface_flux_values - surface_flux_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) - size_arr = CuArray{Float64}(undef, size(interfaces_u, 3), size(interfaces_u, 4), - size(interfaces_u, 5)) surface_flux_kernel = @cuda launch=false surface_flux_kernel!(surface_flux_arr, interfaces_u, orientations, equations, surface_flux) surface_flux_kernel(surface_flux_arr, interfaces_u, orientations, equations, surface_flux; - configurator_3d(surface_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3)^2, - size(interfaces_u, 5)) + kernel_configurator_3d(surface_flux_kernel, size(interfaces_u, 3), + size(interfaces_u, 4), + size(interfaces_u, 5))...) interface_flux_kernel = @cuda launch=false interface_flux_kernel!(surface_flux_values, surface_flux_arr, @@ -1752,7 +1735,10 @@ function cuda_interface_flux!(mesh::TreeMesh{3}, nonconservative_terms::False, e equations) interface_flux_kernel(surface_flux_values, surface_flux_arr, neighbor_ids, orientations, equations; - configurator_3d(interface_flux_kernel, size_arr)...) + kernel_configurator_3d(interface_flux_kernel, + size(surface_flux_values, 1), + size(interfaces_u, 3)^2, + size(interfaces_u, 5))...) return nothing end @@ -1771,9 +1757,6 @@ function cuda_interface_flux!(mesh::TreeMesh{3}, nonconservative_terms::True, eq noncons_left_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) noncons_right_arr = CuArray{Float64}(undef, size(interfaces_u)[2:end]...) - size_arr = CuArray{Float64}(undef, size(interfaces_u, 3), size(interfaces_u, 4), - size(interfaces_u, 5)) - surface_noncons_flux_kernel = @cuda launch=false surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, @@ -1784,10 +1767,10 @@ function cuda_interface_flux!(mesh::TreeMesh{3}, nonconservative_terms::True, eq nonconservative_flux) surface_noncons_flux_kernel(surface_flux_arr, noncons_left_arr, noncons_right_arr, interfaces_u, orientations, equations, surface_flux, nonconservative_flux; - configurator_3d(surface_noncons_flux_kernel, size_arr)...) - - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(interfaces_u, 3)^2, - size(interfaces_u, 5)) + kernel_configurator_3d(surface_noncons_flux_kernel, + size(interfaces_u, 3), + size(interfaces_u, 4), + size(interfaces_u, 5))...) interface_flux_kernel = @cuda launch=false interface_flux_kernel!(surface_flux_values, surface_flux_arr, @@ -1798,7 +1781,10 @@ function cuda_interface_flux!(mesh::TreeMesh{3}, nonconservative_terms::True, eq interface_flux_kernel(surface_flux_values, surface_flux_arr, noncons_left_arr, noncons_right_arr, neighbor_ids, orientations, equations; - configurator_3d(interface_flux_kernel, size_arr)...) + kernel_configurator_3d(interface_flux_kernel, + size(surface_flux_values, 1), + size(interfaces_u, 3)^2, + size(interfaces_u, 5))...) return nothing end @@ -1817,9 +1803,6 @@ function cuda_prolong2boundaries!(u, mesh::TreeMesh{3}, boundary_conditions::Nam orientations = cache.boundaries.orientations boundaries_u = cache.boundaries.u - size_arr = CuArray{Float64}(undef, size(boundaries_u, 2) * size(boundaries_u, 3)^2, - size(boundaries_u, 5)) - prolong_boundaries_kernel = @cuda launch=false prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides, @@ -1827,7 +1810,10 @@ function cuda_prolong2boundaries!(u, mesh::TreeMesh{3}, boundary_conditions::Nam equations) prolong_boundaries_kernel(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, equations; - configurator_2d(prolong_boundaries_kernel, size_arr)...) + kernel_configurator_2d(prolong_boundaries_kernel, + size(boundaries_u, 2) * + size(boundaries_u, 3)^2, + size(boundaries_u, 5))...) return nothing end @@ -1859,13 +1845,11 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{3}, boundary_conditions::NamedTup last_first_indices_kernel = @cuda launch=false last_first_indices_kernel!(lasts, firsts, n_boundaries_per_direction) last_first_indices_kernel(lasts, firsts, n_boundaries_per_direction; - configurator_1d(last_first_indices_kernel, lasts)...) + kernel_configurator_1d(last_first_indices_kernel, length(lasts))...) - indices_arr = firsts boundary_arr = CuArray{Int}(Array(firsts)[1]:Array(lasts)[end]) - + indices_arr = firsts boundary_conditions_callable = replace_boundary_conditions(boundary_conditions) - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 2)^2, length(boundary_arr)) boundary_flux_kernel = @cuda launch=false boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, @@ -1877,7 +1861,9 @@ function cuda_boundary_flux!(t, mesh::TreeMesh{3}, boundary_conditions::NamedTup boundary_flux_kernel(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, indices_arr, neighbor_ids, neighbor_sides, orientations, boundary_conditions_callable, equations, surface_flux; - configurator_2d(boundary_flux_kernel, size_arr)...) + kernel_configurator_2d(boundary_flux_kernel, + size(surface_flux_values, 2)^2, + length(boundary_arr))...) return nothing end @@ -1898,13 +1884,9 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG u_upper_right = cache.mortars.u_upper_right u_lower_left = cache.mortars.u_lower_left u_lower_right = cache.mortars.u_lower_right - forward_upper = CuArray{Float64}(dg.mortar.forward_upper) forward_lower = CuArray{Float64}(dg.mortar.forward_lower) - size_arr = CuArray{Float64}(undef, size(u_upper_left, 2), size(u_upper_left, 3)^2, - size(u_upper_left, 5)) - prolong_mortars_small2small_kernel = @cuda launch=false prolong_mortars_small2small_kernel!(u_upper_left, u_upper_right, u_lower_left, @@ -1915,8 +1897,11 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG orientations) prolong_mortars_small2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, u, neighbor_ids, large_sides, orientations; - configurator_3d(prolong_mortars_small2small_kernel, - size_arr)...) + kernel_configurator_3d(prolong_mortars_small2small_kernel, + size(u_upper_left, 2), + size(u_upper_left, 3)^2, + size(u_upper_left, 5))...) + tmp_upper_left = zero(similar(u_upper_left)) # undef to zero tmp_upper_right = zero(similar(u_upper_right)) # undef to zero tmp_lower_left = zero(similar(u_lower_left)) # undef to zero @@ -1936,8 +1921,10 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, u, forward_upper, forward_lower, neighbor_ids, large_sides, orientations; - configurator_3d(prolong_mortars_large2small_kernel, - size_arr)...) + kernel_configurator_3d(prolong_mortars_large2small_kernel, + size(u_upper_left, 2), + size(u_upper_left, 3)^2, + size(u_upper_left, 5))...) prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, @@ -1954,8 +1941,10 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, forward_upper, forward_lower, large_sides; - configurator_3d(prolong_mortars_large2small_kernel, - size_arr)...) + kernel_configurator_3d(prolong_mortars_large2small_kernel, + size(u_upper_left, 2), + size(u_upper_left, 3)^2, + size(u_upper_left, 5))...) return nothing end @@ -1991,9 +1980,6 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati fstar_lower_left = cache.fstar_lower_left fstar_lower_right = cache.fstar_lower_right - size_arr = CuArray{Float64}(undef, size(u_upper_left, 3), size(u_upper_left, 4), - length(orientations)) - mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_upper_left, fstar_upper_right, fstar_lower_left, fstar_lower_right, u_upper_left, u_upper_right, @@ -2003,16 +1989,15 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati mortar_flux_kernel(fstar_upper_left, fstar_upper_right, fstar_lower_left, fstar_lower_right, u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, equations, surface_flux; - configurator_3d(mortar_flux_kernel, size_arr)...) + kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3), + size(u_upper_left, 4), + length(orientations))...) tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(surface_flux_values, 2)^2, - length(orientations)) - # TODO: Combine these two kernels into one (synchronization) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, @@ -2032,7 +2017,10 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati tmp_lower_right, fstar_upper_left, fstar_upper_right, fstar_lower_left, fstar_lower_right, reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; - configurator_3d(mortar_flux_copy_to_kernel, size_arr)...) + kernel_configurator_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, @@ -2048,7 +2036,10 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; - configurator_3d(mortar_flux_copy_to_kernel, size_arr)...) + kernel_configurator_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) return nothing end @@ -2072,15 +2063,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati surface_flux_values = cache.elements.surface_flux_values tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero - fstar_upper_left = cache.fstar_upper_left fstar_upper_right = cache.fstar_upper_right fstar_lower_left = cache.fstar_lower_left fstar_lower_right = cache.fstar_lower_right - size_arr = CuArray{Float64}(undef, size(u_upper_left, 3), size(u_upper_left, 4), - length(orientations)) - mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_upper_left, fstar_upper_right, fstar_lower_left, fstar_lower_right, u_upper_left, u_upper_right, @@ -2091,16 +2078,15 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati mortar_flux_kernel(fstar_upper_left, fstar_upper_right, fstar_lower_left, fstar_lower_right, u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, large_sides, equations, surface_flux, nonconservative_flux; - configurator_3d(mortar_flux_kernel, size_arr)...) + kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3), + size(u_upper_left, 4), + length(orientations))...) tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero - size_arr = CuArray{Float64}(undef, size(surface_flux_values, 1), size(surface_flux_values, 2)^2, - length(orientations)) - # TODO: Combine these two kernels into one (synchronization) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, @@ -2120,7 +2106,10 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati tmp_lower_right, fstar_upper_left, fstar_upper_right, fstar_lower_left, fstar_lower_right, reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; - configurator_3d(mortar_flux_copy_to_kernel, size_arr)...) + kernel_configurator_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, @@ -2136,7 +2125,10 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; - configurator_3d(mortar_flux_copy_to_kernel, size_arr)...) + kernel_configurator_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) return nothing end @@ -2149,13 +2141,12 @@ function cuda_surface_integral!(du, mesh::TreeMesh{3}, equations, dg::DGSEM, cac ]) surface_flux_values = cache.elements.surface_flux_values - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) - surface_integral_kernel = @cuda launch=false surface_integral_kernel!(du, factor_arr, surface_flux_values, equations) surface_integral_kernel(du, factor_arr, surface_flux_values, equations; - configurator_3d(surface_integral_kernel, size_arr)...) + kernel_configurator_3d(surface_integral_kernel, size(du, 1), + size(du, 2)^3, size(du, 5))...) return nothing end @@ -2164,10 +2155,10 @@ end function cuda_jacobian!(du, mesh::TreeMesh{3}, equations, cache) inverse_jacobian = cache.elements.inverse_jacobian - size_arr = CuArray{Float64}(undef, size(du, 1), size(du, 2)^3, size(du, 5)) - jacobian_kernel = @cuda launch=false jacobian_kernel!(du, inverse_jacobian, equations) - jacobian_kernel(du, inverse_jacobian, equations; configurator_3d(jacobian_kernel, size_arr)...) + jacobian_kernel(du, inverse_jacobian, equations; + kernel_configurator_3d(jacobian_kernel, size(du, 1), size(du, 2)^3, + size(du, 5))...) return nothing end @@ -2181,12 +2172,10 @@ end function cuda_sources!(du, u, t, source_terms, equations::AbstractEquations{3}, cache) node_coordinates = cache.elements.node_coordinates - size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 5)) - source_terms_kernel = @cuda launch=false source_terms_kernel!(du, u, node_coordinates, t, equations, source_terms) source_terms_kernel(du, u, node_coordinates, t, equations, source_terms; - configurator_2d(source_terms_kernel, size_arr)...) + kernel_configurator_2d(source_terms_kernel, size(u, 2)^3, size(u, 5))...) return nothing end diff --git a/test/runtests.jl b/test/runtests.jl index aa306ac..bb08b35 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,4 +8,4 @@ using Test: @testset # include("./tree_dgsem_3d/tree_dgsem_3d.jl") end -end +end # module diff --git a/test/test_macros.jl b/test/test_macros.jl index a084f71..f3de377 100644 --- a/test/test_macros.jl +++ b/test/test_macros.jl @@ -64,8 +64,8 @@ macro test_approx(expr) # Direct comparison @test _array1 ≈ _array2 - else # one has NaN and the other does not - # Truth table + + # Truth table for below cases # ------------------------------- # Entry | Entry | Result # ------------------------------- @@ -74,29 +74,38 @@ macro test_approx(expr) # non-NaN | zero | 1 # non-NaN | non-zero | 1 # ------------------------------- - if has_nan_arr1 - # Condition 1: Check truth table above - local cond1 = all(.!(isnan.(_array1) .&& (_array2 .!= 0.0))) + elseif has_nan_arr1 # only the first array has NaN + # Condition 1: Check truth table above + local cond1 = all(.!(isnan.(_array1) .&& (_array2 .!= 0.0))) - # Replace NaNs with 0.0 - local __array1 = replace(_array1, NaN => 0.0) + # Replace NaNs with 0.0 + local __array1 = replace(_array1, NaN => 0.0) - # Condition 2: Check if the arrays are approximately equal - local cond2 = _array2 ≈ __array1 + # Condition 2: Check if the arrays are approximately equal + local cond2 = _array2 ≈ __array1 - @test cond1 && cond2 - else # has_nan_arr2 - # Condition 1: Check truth table above - local cond1 = all(.!(isnan.(_array2) .&& (_array1 .!= 0.0))) + @test cond1 && cond2 + elseif has_nan_arr2 # only the second array has NaN + # Condition 1: Check truth table above + local cond1 = all(.!(isnan.(_array2) .&& (_array1 .!= 0.0))) - # Replace NaNs with 0.0 - local __array2 = replace(_array2, NaN => 0.0) + # Replace NaNs with 0.0 + local __array2 = replace(_array2, NaN => 0.0) - # Condition 2: Check if the arrays are approximately equal - local cond2 = _array1 ≈ __array2 + # Condition 2: Check if the arrays are approximately equal + local cond2 = _array1 ≈ __array2 - @test cond1 && cond2 - end + @test cond1 && cond2 end end end + +# Truth table +# ------------------------------- +# Entry | Entry | Result +# ------------------------------- +# NaN | zero | 1 +# NaN | non-zero | 0 +# non-NaN | zero | 1 +# non-NaN | non-zero | 1 +# ------------------------------- diff --git a/test/test_script.jl b/test/test_script.jl index 5ce6563..613e744 100644 --- a/test/test_script.jl +++ b/test/test_script.jl @@ -1,101 +1,58 @@ -include("test_trixicuda.jl") +include("test_macros.jl") -equations = HyperbolicDiffusionEquations1D() +advection_velocity = (0.2, -0.7, 0.5) +equations = LinearScalarAdvectionEquation3D(advection_velocity) -initial_condition = initial_condition_poisson_nonperiodic +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) -boundary_conditions = boundary_condition_poisson_nonperiodic +coordinates_min = (-1.0, -1.0, -1.0) +coordinates_max = (1.0, 1.0, 1.0) -solver = DGSEM(polydeg = 4, surface_flux = flux_lax_friedrichs) - -coordinates_min = 0.0 -coordinates_max = 1.0 mesh = TreeMesh(coordinates_min, coordinates_max, initial_refinement_level = 3, - n_cells_max = 30_000, - periodicity = false) + n_cells_max = 30_000) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, + solver) +semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition_convergence_test, + solver) -semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, - boundary_conditions = boundary_conditions, - source_terms = source_terms_poisson_nonperiodic) -semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver, - boundary_conditions = boundary_conditions, - source_terms = source_terms_poisson_nonperiodic) +tspan = (0.0, 1.0) -tspan = (0.0, 5.0) +ode = semidiscretize(semi, tspan) +u_ode = copy(ode.u0) +du_ode = similar(u_ode) # Get CPU data +t = 0.0 (; mesh, equations, initial_condition, boundary_conditions, source_terms, solver, cache) = semi +u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) +du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) # Get GPU data -mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache +t_gpu = 0.0 equations_gpu = semi_gpu.equations +mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache initial_condition_gpu = semi_gpu.initial_condition boundary_conditions_gpu = semi_gpu.boundary_conditions source_terms_gpu = semi_gpu.source_terms - -# Set initial time -t = t_gpu = 0.0 - -# Get initial data -ode = semidiscretize(semi, tspan) -u_ode = copy(ode.u0) -du_ode = similar(u_ode) -u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) -du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) +u_gpu = CuArray(u) +du_gpu = CuArray(du) # Copy data to device du_gpu, u_gpu = TrixiCUDA.copy_to_gpu!(du, u) # Reset data on host Trixi.reset_du!(du, solver, cache) -# Test `cuda_volume_integral!` -TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu.volume_integral, solver_gpu, - cache_gpu) -Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), - equations, solver.volume_integral, solver, cache) -@test_approx du_gpu ≈ du - -# Test `cuda_prolong2interfaces!` -TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) -@test_approx cache_gpu.interfaces.u ≈ cache.interfaces.u - -# Test `cuda_interface_flux!` -TrixiCUDA.cuda_interface_flux!(mesh_gpu, Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.surface_integral, solver, cache) -@test_approx cache_gpu.elements.surface_flux_values ≈ cache.elements.surface_flux_values - -# Test `cuda_prolong2boundaries!` -TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, equations_gpu, - cache_gpu) -Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) -@test_approx cache_gpu.boundaries.u ≈ cache.boundaries.u - -# Test `cuda_boundary_flux!` -TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), equations_gpu, - solver_gpu, cache_gpu) -Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - solver.surface_integral, solver) -@test_approx cache_gpu.elements.surface_flux_values ≈ cache.elements.surface_flux_values +derivative_dhat = CuArray{Float64}(solver_gpu.basis.derivative_dhat) +flux_arr1 = similar(u_gpu) +flux_arr2 = similar(u_gpu) +flux_arr3 = similar(u_gpu) -# Test `cuda_surface_integral!` -TrixiCUDA.cuda_surface_integral!(du_gpu, mesh_gpu, equations_gpu, solver_gpu, cache_gpu) -Trixi.calc_surface_integral!(du, u, mesh, equations, solver.surface_integral, solver, cache) -@test_approx du_gpu ≈ du +size_arr = CuArray{Float64}(undef, size(u, 2)^3, size(u, 5)) -# Test `cuda_jacobian!` -TrixiCUDA.cuda_jacobian!(du_gpu, mesh_gpu, equations_gpu, cache_gpu) -Trixi.apply_jacobian!(du, mesh, equations, solver, cache) -@test_approx du_gpu ≈ du +flux_kernel = @cuda launch=false TrixiCUDA.flux_kernel!(flux_arr1, flux_arr2, flux_arr3, u_gpu, + equations, + flux) -# Test `cuda_sources!` -TrixiCUDA.cuda_sources!(du_gpu, u_gpu, t_gpu, source_terms_gpu, equations_gpu, cache_gpu) -Trixi.calc_sources!(du, u, t, source_terms, equations, solver, cache) -@test_approx du_gpu ≈ du +config = launch_configuration(flux_kernel.fun)