diff --git a/benchmark.jl b/benchmark.jl index b401eca..36a1c22 100644 --- a/benchmark.jl +++ b/benchmark.jl @@ -2,24 +2,27 @@ using Trixi, TrixiCUDA using CUDA using BenchmarkTools -equations = CompressibleEulerEquations3D(1.4) +equations = IdealGlmMhdEquations3D(5 / 3) initial_condition = initial_condition_convergence_test -solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) -coordinates_min = (0.0, 0.0, 0.0) -coordinates_max = (2.0, 2.0, 2.0) -refinement_patches = ((type = "box", coordinates_min = (0.5, 0.5, 0.5), - coordinates_max = (1.5, 1.5, 1.5)),) +volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell) +solver = DGSEM(polydeg = 3, + surface_flux = (flux_hlle, + flux_nonconservative_powell), + volume_integral = VolumeIntegralFluxDifferencing(volume_flux)) + +coordinates_min = (-1.0, -1.0, -1.0) +coordinates_max = (1.0, 1.0, 1.0) +refinement_patches = ((type = "box", coordinates_min = (-0.5, -0.5, -0.5), + coordinates_max = (0.5, 0.5, 0.5)),) mesh = TreeMesh(coordinates_min, coordinates_max, initial_refinement_level = 2, refinement_patches = refinement_patches, n_cells_max = 10_000) -semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, - source_terms = source_terms_convergence_test) -semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver, - source_terms = source_terms_convergence_test) +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) +semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) tspan = tspan_gpu = (0.0, 1.0) t = t_gpu = 0.0 @@ -73,11 +76,10 @@ TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, equations_gpu, solver_gpu, cache_gpu) # Prolong to mortars test -@benchmark CUDA.@sync TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) - -# # Mortar flux test -# @benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), -# Trixi.have_nonconservative_terms(equations_gpu), -# equations_gpu, solver_gpu, cache_gpu) +TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, + TrixiCUDA.check_cache_mortars(cache_gpu), + solver_gpu, cache_gpu) + +@benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu, cache_gpu) diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index de0643e..e7cae9f 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -902,7 +902,7 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_value fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations::AbstractEquations{2}) + orientations) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z @@ -1619,12 +1619,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati reverse_lower, neighbor_ids, large_sides, - orientations, - equations) + orientations) mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations; + orientations; kernel_configurator_3d(mortar_flux_copy_to_kernel, size(surface_flux_values, 1), size(surface_flux_values, 2), @@ -1679,12 +1678,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati reverse_lower, neighbor_ids, large_sides, - orientations, - equations) + orientations) mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations; + orientations; kernel_configurator_3d(mortar_flux_copy_to_kernel, size(surface_flux_values, 1), size(surface_flux_values, 2), diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 54ed7be..8bdf846 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -1272,8 +1272,128 @@ function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right end # # Kernel for copying mortar fluxes small to small and small to large - step 1 -function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right, - tmp_lower_left, tmp_lower_right, +# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right, +# tmp_lower_left, tmp_lower_right, +# fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && +# k <= length(orientations)) +# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 +# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + +# lower_left_element = neighbor_ids[1, k] +# lower_right_element = neighbor_ids[2, k] +# upper_left_element = neighbor_ids[3, k] +# upper_right_element = neighbor_ids[4, k] +# large_element = neighbor_ids[5, k] + +# large_side = large_sides[k] +# orientation = orientations[k] + +# # Use simple math expression to enhance the performance (against control flow), +# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + +# # isequal(large_side, 1) * isequal(orientation, 2) * 3 + +# # isequal(large_side, 1) * isequal(orientation, 3) * 5 + +# # isequal(large_side, 2) * isequal(orientation, 1) * 2 + +# # isequal(large_side, 2) * isequal(orientation, 2) * 4 + +# # isequal(large_side, 2) * isequal(orientation, 3) * 6`. +# # Please also check the original code in Trixi.jl when you modify this code. +# direction = 2 * orientation + large_side - 2 + +# surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] + +# # Use simple math expression to enhance the performance (against control flow), +# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + +# # isequal(large_side, 1) * isequal(orientation, 2) * 4 + +# # isequal(large_side, 1) * isequal(orientation, 3) * 6 + +# # isequal(large_side, 2) * isequal(orientation, 1) * 1 + +# # isequal(large_side, 2) * isequal(orientation, 2) * 3 + +# # isequal(large_side, 2) * isequal(orientation, 3) * 5`. +# # Please also check the original code in Trixi.jl when you modify this code. +# direction = 2 * orientation - large_side + 1 + +# @inbounds begin +# for j1j1 in axes(reverse_upper, 2) +# tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * +# fstar_secondary_upper_left[i, j1j1, j2, k] +# tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * +# fstar_secondary_upper_right[i, j1j1, j2, k] +# tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * +# fstar_secondary_lower_left[i, j1j1, j2, k] +# tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * +# fstar_secondary_lower_right[i, j1j1, j2, k] +# end +# end +# end + +# return nothing +# end + +# # Kernel for copying mortar fluxes small to small and small to large - step 2 +# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations, equations::AbstractEquations{3}) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && +# k <= length(orientations)) +# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 +# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + +# large_element = neighbor_ids[5, k] + +# large_side = large_sides[k] +# orientation = orientations[k] + +# # See step 1 for the explanation of the following expression +# direction = 2 * orientation - large_side + 1 + +# @inbounds begin +# for j2j2 in axes(reverse_lower, 2) +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * +# tmp_upper_left[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * +# tmp_upper_right[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * +# tmp_lower_left[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * +# tmp_lower_right[i, j1, j2j2, +# direction, +# large_element] +# end + +# surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, +# direction, +# large_element] +# end +# end + +# return nothing +# end + +# Kernel for copying mortar fluxes small to small and small to large +function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, + tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, fstar_secondary_upper_right, @@ -1284,108 +1404,98 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_up j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && - k <= length(orientations)) - j1 = div(j - 1, size(surface_flux_values, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 - - lower_left_element = neighbor_ids[1, k] - lower_right_element = neighbor_ids[2, k] - upper_left_element = neighbor_ids[3, k] - upper_right_element = neighbor_ids[4, k] - large_element = neighbor_ids[5, k] - - large_side = large_sides[k] - orientation = orientations[k] - - # Use simple math expression to enhance the performance (against control flow), - # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + - # isequal(large_side, 1) * isequal(orientation, 2) * 3 + - # isequal(large_side, 1) * isequal(orientation, 3) * 5 + - # isequal(large_side, 2) * isequal(orientation, 1) * 2 + - # isequal(large_side, 2) * isequal(orientation, 2) * 4 + - # isequal(large_side, 2) * isequal(orientation, 3) * 6`. - # Please also check the original code in Trixi.jl when you modify this code. - direction = 2 * orientation + large_side - 2 - - surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] - - # Use simple math expression to enhance the performance (against control flow), - # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + - # isequal(large_side, 1) * isequal(orientation, 2) * 4 + - # isequal(large_side, 1) * isequal(orientation, 3) * 6 + - # isequal(large_side, 2) * isequal(orientation, 1) * 1 + - # isequal(large_side, 2) * isequal(orientation, 2) * 3 + - # isequal(large_side, 2) * isequal(orientation, 3) * 5`. - # Please also check the original code in Trixi.jl when you modify this code. - direction = 2 * orientation - large_side + 1 + # Loop stride for each dimension + stride_x = gridDim().x * blockDim().x + stride_y = gridDim().y * blockDim().y + stride_z = gridDim().z * blockDim().z - @inbounds begin - for j1j1 in axes(reverse_upper, 2) - tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * - fstar_secondary_upper_left[i, j1j1, j2, k] - tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * - fstar_secondary_upper_right[i, j1j1, j2, k] - tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * - fstar_secondary_lower_left[i, j1j1, j2, k] - tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * - fstar_secondary_lower_right[i, j1j1, j2, k] - end - end - end + # Cooperative kernel needs stride loops to handle the constrained launch size + while i <= size(surface_flux_values, 1) + while j <= size(surface_flux_values, 2)^2 + while k <= length(orientations) + j1 = div(j - 1, size(surface_flux_values, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + + lower_left_element = neighbor_ids[1, k] + lower_right_element = neighbor_ids[2, k] + upper_left_element = neighbor_ids[3, k] + upper_right_element = neighbor_ids[4, k] + large_element = neighbor_ids[5, k] - return nothing -end + large_side = large_sides[k] + orientation = orientations[k] -# Kernel for copying mortar fluxes small to small and small to large - step 2 -function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + + # isequal(large_side, 1) * isequal(orientation, 2) * 3 + + # isequal(large_side, 1) * isequal(orientation, 3) * 5 + + # isequal(large_side, 2) * isequal(orientation, 1) * 2 + + # isequal(large_side, 2) * isequal(orientation, 2) * 4 + + # isequal(large_side, 2) * isequal(orientation, 3) * 6`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation + large_side - 2 + + surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] + + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + + # isequal(large_side, 1) * isequal(orientation, 2) * 4 + + # isequal(large_side, 1) * isequal(orientation, 3) * 6 + + # isequal(large_side, 2) * isequal(orientation, 1) * 1 + + # isequal(large_side, 2) * isequal(orientation, 2) * 3 + + # isequal(large_side, 2) * isequal(orientation, 3) * 5`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation - large_side + 1 - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && - k <= length(orientations)) - j1 = div(j - 1, size(surface_flux_values, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + @inbounds begin + for j1j1 in axes(reverse_upper, 2) + tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_secondary_upper_left[i, j1j1, j2, k] + tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_secondary_upper_right[i, j1j1, j2, k] + tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_secondary_lower_left[i, j1j1, j2, k] + tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_secondary_lower_right[i, j1j1, j2, k] + end + end - large_element = neighbor_ids[5, k] + # Grid scope synchronization + grid = CG.this_grid() + CG.sync(grid) - large_side = large_sides[k] - orientation = orientations[k] - - # See step 1 for the explanation of the following expression - direction = 2 * orientation - large_side + 1 + @inbounds begin + for j2j2 in axes(reverse_lower, 2) + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * + tmp_upper_left[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * + tmp_upper_right[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * + tmp_lower_left[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * + tmp_lower_right[i, j1, j2j2, + direction, + large_element] + end - @inbounds begin - for j2j2 in axes(reverse_lower, 2) - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * - tmp_upper_left[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * - tmp_upper_right[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * - tmp_lower_left[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * - tmp_lower_right[i, j1, j2j2, - direction, - large_element] + surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, + direction, + large_element] + end + k += stride_z end - - surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, - direction, - large_element] + j += stride_y end + i += stride_x end return nothing @@ -2118,7 +2228,116 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::False, nonconservat @assert iszero(length(cache.mortars.orientations)) end -# Pack kernels for calculating mortar fluxes +# # Pack kernels for calculating mortar fluxes +# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False, +# equations, dg::DGSEM, cache) +# surface_flux = dg.surface_integral.surface_flux + +# neighbor_ids = cache.mortars.neighbor_ids +# large_sides = cache.mortars.large_sides +# orientations = cache.mortars.orientations + +# # The original CPU arrays hold NaNs +# u_upper_left = cache.mortars.u_upper_left +# u_upper_right = cache.mortars.u_upper_right +# u_lower_left = cache.mortars.u_lower_left +# u_lower_right = cache.mortars.u_lower_right +# reverse_upper = CuArray(dg.mortar.reverse_upper) +# reverse_lower = CuArray(dg.mortar.reverse_lower) + +# surface_flux_values = cache.elements.surface_flux_values +# tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero + +# fstar_primary_upper_left = cache.fstar_primary_upper_left +# fstar_primary_upper_right = cache.fstar_primary_upper_right +# fstar_primary_lower_left = cache.fstar_primary_lower_left +# fstar_primary_lower_right = cache.fstar_primary_lower_right +# fstar_secondary_upper_left = cache.fstar_secondary_upper_left +# fstar_secondary_upper_right = cache.fstar_secondary_upper_right +# fstar_secondary_lower_left = cache.fstar_secondary_lower_left +# fstar_secondary_lower_right = cache.fstar_secondary_lower_right + +# mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# u_upper_left, u_upper_right, +# u_lower_left, u_lower_right, +# orientations, equations, +# surface_flux) +# mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, +# equations, surface_flux; +# kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3), +# size(u_upper_left, 4), +# length(orientations))...) + +# tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations, +# equations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, +# reverse_lower, neighbor_ids, large_sides, orientations, equations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# return nothing +# end + +# Pack kernels for calculating mortar fluxes (optimized) function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False, equations, dg::DGSEM, cache) surface_flux = dg.surface_integral.surface_flux @@ -2174,8 +2393,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero - # TODO: Combine these two copy kernels into one (synchronization) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, + tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, @@ -2193,41 +2412,130 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati neighbor_ids, large_sides, orientations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_secondary_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) - - mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, - tmp_surface_flux_values, - tmp_upper_left, - tmp_upper_right, - tmp_lower_left, - tmp_lower_right, - reverse_upper, - reverse_lower, - neighbor_ids, - large_sides, - orientations, - equations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, - reverse_lower, neighbor_ids, large_sides, orientations, equations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) + mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, + tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, + fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right, + reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; + cooperative = true, + kernel_configurator_coop_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) return nothing end +# # Pack kernels for calculating mortar fluxes +# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True, +# equations, dg::DGSEM, cache) +# surface_flux, nonconservative_flux = dg.surface_integral.surface_flux + +# neighbor_ids = cache.mortars.neighbor_ids +# large_sides = cache.mortars.large_sides +# orientations = cache.mortars.orientations + +# # The original CPU arrays hold NaNs +# u_upper_left = cache.mortars.u_upper_left +# u_upper_right = cache.mortars.u_upper_right +# u_lower_left = cache.mortars.u_lower_left +# u_lower_right = cache.mortars.u_lower_right +# reverse_upper = CuArray(dg.mortar.reverse_upper) +# reverse_lower = CuArray(dg.mortar.reverse_lower) + +# surface_flux_values = cache.elements.surface_flux_values +# tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero + +# fstar_primary_upper_left = cache.fstar_primary_upper_left +# fstar_primary_upper_right = cache.fstar_primary_upper_right +# fstar_primary_lower_left = cache.fstar_primary_lower_left +# fstar_primary_lower_right = cache.fstar_primary_lower_right +# fstar_secondary_upper_left = cache.fstar_secondary_upper_left +# fstar_secondary_upper_right = cache.fstar_secondary_upper_right +# fstar_secondary_lower_left = cache.fstar_secondary_lower_left +# fstar_secondary_lower_right = cache.fstar_secondary_lower_right + +# mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# u_upper_left, u_upper_right, +# u_lower_left, u_lower_right, +# orientations, large_sides, +# equations, surface_flux, +# nonconservative_flux) +# mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, +# large_sides, equations, surface_flux, nonconservative_flux; +# kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3), +# size(u_upper_left, 4), +# length(orientations))...) + +# tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations, +# equations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, +# reverse_lower, neighbor_ids, large_sides, orientations, equations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# return nothing +# end + # Pack kernels for calculating mortar fluxes function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True, equations, dg::DGSEM, cache) @@ -2285,8 +2593,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero - # TODO: Combine these two copy kernels into one (synchronization) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, + tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, @@ -2304,37 +2612,16 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati neighbor_ids, large_sides, orientations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_secondary_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) - - mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, - tmp_surface_flux_values, - tmp_upper_left, - tmp_upper_right, - tmp_lower_left, - tmp_lower_right, - reverse_upper, - reverse_lower, - neighbor_ids, - large_sides, - orientations, - equations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, - reverse_lower, neighbor_ids, large_sides, orientations, equations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) + mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, + tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, + fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right, + reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; + cooperative = true, + kernel_configurator_coop_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) return nothing end diff --git a/test.jl b/test.jl deleted file mode 100644 index 95ad959..0000000 --- a/test.jl +++ /dev/null @@ -1,48 +0,0 @@ -# using CUDA - -# # Kernel using CG.this_grid() for grid-level synchronization -# function simple_kernel(data) -# idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - -# # Simple operation: increment each element -# if idx <= length(data) -# data[idx] += 1.0f0 -# end - -# grid = CG.this_grid() -# CG.sync(grid) - -# return nothing -# end - -# # Host code -# function main() -# n = 1024 -# data = CUDA.fill(Float32(1.0), n) # Initialize array with 1.0s - -# # Kernel launch configuration -# threads_per_block = 10 -# num_blocks = ceil(Int, n / threads_per_block) - -# println("num_blocks: ", num_blocks) - -# # Launch kernel with cooperative launch enabled -# kernel = @cuda launch=false simple_kernel(data) - -# kernel(data; blocks=num_blocks, threads=threads_per_block, cooperative=true) - -# # Fetch results back to host -# result = Array(data) -# end - -# main() - -using CUDA - -function query_max_threads_per_block() - device = CUDA.device() # Get the current CUDA device - max_threads = CUDA.capability(device).max_threads_per_block # Query max threads per block - println("Maximum threads per block: $max_threads") -end - -query_max_threads_per_block() diff --git a/test2.jl b/test2.jl deleted file mode 100644 index ab044f2..0000000 --- a/test2.jl +++ /dev/null @@ -1,97 +0,0 @@ -using Trixi, TrixiCUDA -using CUDA -using Test -using BenchmarkTools - -include("test/test_macros.jl") - -advection_velocity = (0.2, -0.7, 0.5) -equations = LinearScalarAdvectionEquation3D(advection_velocity) - -initial_condition = initial_condition_convergence_test -solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) - -coordinates_min = (-1.0, -1.0, -1.0) -coordinates_max = (1.0, 1.0, 1.0) -refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0), - coordinates_max = (1.0, 1.0, 1.0)), - (type = "box", coordinates_min = (0.0, -0.5, -0.5), - coordinates_max = (0.5, 0.5, 0.5))) -mesh = TreeMesh(coordinates_min, coordinates_max, - initial_refinement_level = 2, - refinement_patches = refinement_patches, - n_cells_max = 10_000) - -semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) -semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) - -tspan = tspan_gpu = (0.0, 5.0) -t = t_gpu = 0.0 - -# Semi on CPU -(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi - -# Semi on GPU -equations_gpu = semi_gpu.equations -mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache -boundary_conditions_gpu = semi_gpu.boundary_conditions -source_terms_gpu = semi_gpu.source_terms - -# ODE on CPU -ode = semidiscretize(semi, tspan) -u_ode = copy(ode.u0) -du_ode = similar(u_ode) -u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) -du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) - -# ODE on GPU -ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu) -u_gpu = copy(ode_gpu.u0) -du_gpu = similar(u_gpu) - -# Tests for components initialization -@test_approx (u_gpu, u) -# du is initlaizaed as undefined, cannot test now - -# Tests for semidiscretization process -TrixiCUDA.reset_du!(du_gpu) -Trixi.reset_du!(du, solver, cache) -@test_approx (du_gpu, du) - -TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu.volume_integral, solver_gpu, - cache_gpu) -Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), - equations, solver.volume_integral, solver, cache) -@test_approx (du_gpu, du) - -TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) -@test_approx (cache_gpu.interfaces.u, cache.interfaces.u) - -TrixiCUDA.cuda_interface_flux!(mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.surface_integral, solver, cache) -@test_approx (cache_gpu.elements.surface_flux_values, - cache.elements.surface_flux_values) - -TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, - equations_gpu, cache_gpu) -Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) -@test_approx (cache_gpu.boundaries.u, cache.boundaries.u) - -TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - solver.surface_integral, solver) -@test_approx (cache_gpu.elements.surface_flux_values, - cache.elements.surface_flux_values) - -TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu)