From 34b86b73042e87d82f8bc2584ab04864c16a8b2d Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Fri, 20 Dec 2024 19:03:21 -1000 Subject: [PATCH 1/3] Cooperative kernels --- benchmark.jl | 83 ++++++ src/auxiliary/configurators.jl | 81 +++++- src/solvers/dg_3d.jl | 462 +++++++++++++++++++++++---------- test.jl | 48 ++++ test2.jl | 97 +++++++ 5 files changed, 627 insertions(+), 144 deletions(-) create mode 100644 benchmark.jl create mode 100644 test.jl create mode 100644 test2.jl diff --git a/benchmark.jl b/benchmark.jl new file mode 100644 index 00000000..b401eca3 --- /dev/null +++ b/benchmark.jl @@ -0,0 +1,83 @@ +using Trixi, TrixiCUDA +using CUDA +using BenchmarkTools + +equations = CompressibleEulerEquations3D(1.4) + +initial_condition = initial_condition_convergence_test +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (0.0, 0.0, 0.0) +coordinates_max = (2.0, 2.0, 2.0) +refinement_patches = ((type = "box", coordinates_min = (0.5, 0.5, 0.5), + coordinates_max = (1.5, 1.5, 1.5)),) +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 2, + refinement_patches = refinement_patches, + n_cells_max = 10_000) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + source_terms = source_terms_convergence_test) +semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver, + source_terms = source_terms_convergence_test) + +tspan = tspan_gpu = (0.0, 1.0) +t = t_gpu = 0.0 + +# Semi on CPU +(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi + +# Semi on GPU +equations_gpu = semi_gpu.equations +mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache +boundary_conditions_gpu = semi_gpu.boundary_conditions +source_terms_gpu = semi_gpu.source_terms + +# ODE on CPU +ode = semidiscretize(semi, tspan) +u_ode = copy(ode.u0) +du_ode = similar(u_ode) +u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) +du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) + +# ODE on GPU +ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu) +u_gpu = copy(ode_gpu.u0) +du_gpu = similar(u_gpu) + +# Semidiscretization process +# Reset du test +TrixiCUDA.reset_du!(du_gpu) + +# Volume integral test +TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) + +# Prolong to interfaces test +TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) + +# Interface flux test +TrixiCUDA.cuda_interface_flux!(mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu, cache_gpu) + +# Prolong to boundaries test +TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, + equations_gpu, cache_gpu) + +# Boundary flux test +TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu, cache_gpu) + +# Prolong to mortars test +@benchmark CUDA.@sync TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, + TrixiCUDA.check_cache_mortars(cache_gpu), + solver_gpu, cache_gpu) + +# # Mortar flux test +# @benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), +# Trixi.have_nonconservative_terms(equations_gpu), +# equations_gpu, solver_gpu, cache_gpu) diff --git a/src/auxiliary/configurators.jl b/src/auxiliary/configurators.jl index c2e04a75..813576a9 100644 --- a/src/auxiliary/configurators.jl +++ b/src/auxiliary/configurators.jl @@ -2,7 +2,7 @@ # blocks to be used in the kernel, which optimizes the use of GPU resources. # 1D kernel configurator -# We hardcode 32 threads per block for 1D kernels +# We hardcode 32 threads per block for 1D kernels. function kernel_configurator_1d(kernel::HostKernel, x::Int) # config = launch_configuration(kernel.fun) # not used in this case @@ -12,9 +12,25 @@ function kernel_configurator_1d(kernel::HostKernel, x::Int) return (threads = threads, blocks = blocks) end +# 1D kernel configurator for cooperative launch +# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, +# so we need to query the SM count first. Also, kernels launched with cooperative launch have +# to use stride loops to handle the constrained launch size. +function kernel_configurator_coop_1d(kernel::HostKernel, x::Int) + # config = launch_configuration(kernel.fun) # not used in this case + # Maybe pack properties into a struct + device = CUDA.device() + sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs + + threads = 32 # warp size is 32, if block size is less than 32, it will be padded to 32 + blocks = min(cld(x, threads), sm_count) + + return (threads = threads, blocks = blocks) +end + # 2D kernel configurator # We hardcode 32 threads for x dimension per block, and y dimension is determined -# by the number of threads returned by the launch configuration +# by the number of threads returned by the launch configuration. function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int) config = launch_configuration(kernel.fun) # get the number of threads @@ -31,9 +47,35 @@ function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int) return (threads = threads, blocks = blocks) end +# 2D kernel configurator for cooperative launch +# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, +# so we need to query the SM count first. Also, kernels launched with cooperative launch have +# to use stride loops to handle the constrained launch size. +function kernel_configurator_coop_2d(kernel::HostKernel, x::Int, y::Int) + config = launch_configuration(kernel.fun) # get the number of threads + # Maybe pack properties into a struct + device = CUDA.device() + sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs + + # y dimension + dims_y1 = cld(x * y, 32) + dims_y2 = max(fld(config.threads, 32), 1) + + dims_y = min(dims_y1, dims_y2) + + # x dimension is hardcoded to warp size 32 + threads = (32, dims_y) + blocks_x = cld(x, threads[1]) + blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x)) + + blocks = (blocks_x, blocks_y) + + return (threads = threads, blocks = blocks) +end + # 3D kernel configurator # We hardcode 32 threads for x dimension per block, y and z dimensions are determined -# by the number of threads returned by the launch configuration +# by the number of threads returned by the launch configuration. function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int) config = launch_configuration(kernel.fun) # get the number of threads @@ -56,6 +98,39 @@ function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int) return (threads = threads, blocks = blocks) end +# 3D kernel configurator for cooperative launch +# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, +# so we need to query the SM count first. Also, kernels launched with cooperative launch have +# to use stride loops to handle the constrained launch size. +function kernel_configurator_coop_3d(kernel::HostKernel, x::Int, y::Int, z::Int) + config = launch_configuration(kernel.fun) # get the number of threads + # Maybe pack properties into a struct + device = CUDA.device() + sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs + + # y dimension + dims_y1 = cld(x * y, 32) + dims_y2 = max(fld(config.threads, 32), 1) + + dims_y = min(dims_y1, dims_y2) + + # z dimension + dims_z1 = cld(x * y * z, 32 * dims_y) + dims_z2 = max(fld(config.threads, 32 * dims_y), 1) + + dims_z = min(dims_z1, dims_z2) + + # x dimension is hardcoded to warp size 32 + threads = (32, dims_y, dims_z) + blocks_x = cld(x, threads[1]) + blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x)) + blocks_z = min(cld(z, threads[3]), fld(sm_count, blocks_x * blocks_y)) + + blocks = (blocks_x, blocks_y, blocks_z) + + return (threads = threads, blocks = blocks) +end + # Deprecated old kernel configurators below # function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1}) diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index e14df20f..54ed7be5 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -890,126 +890,248 @@ function prolong_mortars_small2small_kernel!(u_upper_left, u_upper_right, u_lowe return nothing end -# Kernel for interpolating data large to small on mortars - step 1 -function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, u, forward_upper, - forward_lower, neighbor_ids, large_sides, orientations) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 && - k <= size(tmp_upper_left, 5)) - u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - large_side = large_sides[k] - orientation = orientations[k] - large_element = neighbor_ids[5, k] - - leftright = large_side - - @inbounds begin - for j1j1 in axes(forward_lower, 2) - tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - end - - for j1j1 in axes(forward_lower, 2) - tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - large_element] * (large_side - 1) - - tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - large_element] * (large_side - 1) - - tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - large_element] * (large_side - 1) - - tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - large_element] * (large_side - 1) - end - end - end - - return nothing -end - -# Kernel for interpolating data large to small on mortars - step 2 -function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, - u_lower_right, tmp_upper_left, tmp_upper_right, - tmp_lower_left, tmp_lower_right, forward_upper, - forward_lower, large_sides) +# # Kernel for interpolating data large to small on mortars - step 1 +# function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, u, forward_upper, +# forward_lower, neighbor_ids, large_sides, orientations) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 && +# k <= size(tmp_upper_left, 5)) +# u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) + +# j1 = div(j - 1, u2) + 1 +# j2 = rem(j - 1, u2) + 1 + +# large_side = large_sides[k] +# orientation = orientations[k] +# large_element = neighbor_ids[5, k] + +# leftright = large_side + +# @inbounds begin +# for j1j1 in axes(forward_lower, 2) +# tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) +# end + +# for j1j1 in axes(forward_lower, 2) +# tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) +# end +# end +# end + +# return nothing +# end + +# # Kernel for interpolating data large to small on mortars - step 2 +# function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, +# u_lower_right, tmp_upper_left, tmp_upper_right, +# tmp_lower_left, tmp_lower_right, forward_upper, +# forward_lower, large_sides) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 && +# k <= size(u_upper_left, 5)) +# u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2) + +# j1 = div(j - 1, u2) + 1 +# j2 = rem(j - 1, u2) + 1 + +# leftright = large_sides[k] + +# @inbounds begin +# for j2j2 in axes(forward_upper, 2) +# u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * +# tmp_upper_left[leftright, i, j1, j2j2, k] + +# u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * +# tmp_upper_right[leftright, i, j1, j2j2, k] + +# u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * +# tmp_lower_left[leftright, i, j1, j2j2, k] + +# u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * +# tmp_lower_right[leftright, i, j1, j2j2, k] +# end +# end +# end + +# return nothing +# end + +# Kernel for interpolating data large to small on mortars +function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, u_lower_right, + tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, + u, forward_upper, forward_lower, neighbor_ids, large_sides, + orientations) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 && - k <= size(u_upper_left, 5)) - u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - leftright = large_sides[k] - - @inbounds begin - for j2j2 in axes(forward_upper, 2) - u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * - tmp_upper_left[leftright, i, j1, j2j2, k] - - u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * - tmp_upper_right[leftright, i, j1, j2j2, k] - - u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * - tmp_lower_left[leftright, i, j1, j2j2, k] - - u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * - tmp_lower_right[leftright, i, j1, j2j2, k] + # Loop stride for each dimension + stride_x = gridDim().x * blockDim().x + stride_y = gridDim().y * blockDim().y + stride_z = gridDim().z * blockDim().z + + # Cooperative kernel needs stride loops to handle the constrained launch size + while i <= size(tmp_upper_left, 2) + while j <= size(tmp_upper_left, 3)^2 + while k <= size(tmp_upper_left, 5) + u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) + + j1 = div(j - 1, u2) + 1 + j2 = rem(j - 1, u2) + 1 + + large_side = large_sides[k] + orientation = orientations[k] + large_element = neighbor_ids[5, k] + + leftright = large_side + + @inbounds begin + for j1j1 in axes(forward_lower, 2) + tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + end + + for j1j1 in axes(forward_lower, 2) + tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + end + + # Grid scope synchronization + grid = CG.this_grid() + CG.sync(grid) + + for j2j2 in axes(forward_upper, 2) + u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * + tmp_upper_left[leftright, i, j1, j2j2, k] + + u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * + tmp_upper_right[leftright, i, j1, j2j2, k] + + u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * + tmp_lower_left[leftright, i, j1, j2j2, k] + + u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * + tmp_lower_right[leftright, i, j1, j2j2, k] + end + end + k += stride_z end + j += stride_y end + i += stride_x end return nothing @@ -1857,7 +1979,81 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::D @assert iszero(length(cache.mortars.orientations)) end -# Pack kernels for prolonging solution to mortars +# # Pack kernels for prolonging solution to mortars +# function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache) +# neighbor_ids = cache.mortars.neighbor_ids +# large_sides = cache.mortars.large_sides +# orientations = cache.mortars.orientations + +# # The original CPU arrays hold NaNs +# u_upper_left = cache.mortars.u_upper_left +# u_upper_right = cache.mortars.u_upper_right +# u_lower_left = cache.mortars.u_lower_left +# u_lower_right = cache.mortars.u_lower_right +# forward_upper = CuArray(dg.mortar.forward_upper) +# forward_lower = CuArray(dg.mortar.forward_lower) + +# prolong_mortars_small2small_kernel = @cuda launch=false prolong_mortars_small2small_kernel!(u_upper_left, +# u_upper_right, +# u_lower_left, +# u_lower_right, +# u, +# neighbor_ids, +# large_sides, +# orientations) +# prolong_mortars_small2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, u, +# neighbor_ids, large_sides, orientations; +# kernel_configurator_3d(prolong_mortars_small2small_kernel, +# size(u_upper_left, 2), +# size(u_upper_left, 3)^2, +# size(u_upper_left, 5))...) + +# tmp_upper_left = zero(similar(u_upper_left)) # undef to zero +# tmp_upper_right = zero(similar(u_upper_right)) # undef to zero +# tmp_lower_left = zero(similar(u_lower_left)) # undef to zero +# tmp_lower_right = zero(similar(u_lower_right)) # undef to zero + +# prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# u, +# forward_upper, +# forward_lower, +# neighbor_ids, +# large_sides, +# orientations) +# prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, u, forward_upper, forward_lower, +# neighbor_ids, large_sides, orientations; +# kernel_configurator_3d(prolong_mortars_large2small_kernel, +# size(u_upper_left, 2), +# size(u_upper_left, 3)^2, +# size(u_upper_left, 5))...) + +# prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left, +# u_upper_right, +# u_lower_left, +# u_lower_right, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# forward_upper, +# forward_lower, +# large_sides) +# prolong_mortars_large2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, +# tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, forward_upper, forward_lower, large_sides; +# kernel_configurator_3d(prolong_mortars_large2small_kernel, +# size(u_upper_left, 2), +# size(u_upper_left, 3)^2, +# size(u_upper_left, 5))...) + +# return nothing +# end + +# Pack kernels for prolonging solution to mortars (optimized) function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache) neighbor_ids = cache.mortars.neighbor_ids large_sides = cache.mortars.large_sides @@ -1891,25 +2087,6 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG tmp_lower_left = zero(similar(u_lower_left)) # undef to zero tmp_lower_right = zero(similar(u_lower_right)) # undef to zero - # TODO: Combine these two kernels into one (synchronization) - prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(tmp_upper_left, - tmp_upper_right, - tmp_lower_left, - tmp_lower_right, - u, - forward_upper, - forward_lower, - neighbor_ids, - large_sides, - orientations) - prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, u, forward_upper, forward_lower, - neighbor_ids, large_sides, orientations; - kernel_configurator_3d(prolong_mortars_large2small_kernel, - size(u_upper_left, 2), - size(u_upper_left, 3)^2, - size(u_upper_left, 5))...) - prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, @@ -1918,16 +2095,19 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG tmp_upper_right, tmp_lower_left, tmp_lower_right, - forward_upper, + u, forward_upper, forward_lower, - large_sides) + neighbor_ids, + large_sides, + orientations) prolong_mortars_large2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, forward_upper, forward_lower, large_sides; - kernel_configurator_3d(prolong_mortars_large2small_kernel, - size(u_upper_left, 2), - size(u_upper_left, 3)^2, - size(u_upper_left, 5))...) + tmp_lower_right, u, forward_upper, forward_lower, neighbor_ids, + large_sides, orientations; cooperative = true, + kernel_configurator_coop_3d(prolong_mortars_large2small_kernel, + size(u_upper_left, 2), + size(u_upper_left, 3)^2, + size(u_upper_left, 5))...) return nothing end diff --git a/test.jl b/test.jl new file mode 100644 index 00000000..95ad959f --- /dev/null +++ b/test.jl @@ -0,0 +1,48 @@ +# using CUDA + +# # Kernel using CG.this_grid() for grid-level synchronization +# function simple_kernel(data) +# idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + +# # Simple operation: increment each element +# if idx <= length(data) +# data[idx] += 1.0f0 +# end + +# grid = CG.this_grid() +# CG.sync(grid) + +# return nothing +# end + +# # Host code +# function main() +# n = 1024 +# data = CUDA.fill(Float32(1.0), n) # Initialize array with 1.0s + +# # Kernel launch configuration +# threads_per_block = 10 +# num_blocks = ceil(Int, n / threads_per_block) + +# println("num_blocks: ", num_blocks) + +# # Launch kernel with cooperative launch enabled +# kernel = @cuda launch=false simple_kernel(data) + +# kernel(data; blocks=num_blocks, threads=threads_per_block, cooperative=true) + +# # Fetch results back to host +# result = Array(data) +# end + +# main() + +using CUDA + +function query_max_threads_per_block() + device = CUDA.device() # Get the current CUDA device + max_threads = CUDA.capability(device).max_threads_per_block # Query max threads per block + println("Maximum threads per block: $max_threads") +end + +query_max_threads_per_block() diff --git a/test2.jl b/test2.jl new file mode 100644 index 00000000..ab044f24 --- /dev/null +++ b/test2.jl @@ -0,0 +1,97 @@ +using Trixi, TrixiCUDA +using CUDA +using Test +using BenchmarkTools + +include("test/test_macros.jl") + +advection_velocity = (0.2, -0.7, 0.5) +equations = LinearScalarAdvectionEquation3D(advection_velocity) + +initial_condition = initial_condition_convergence_test +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0, -1.0) +coordinates_max = (1.0, 1.0, 1.0) +refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0), + coordinates_max = (1.0, 1.0, 1.0)), + (type = "box", coordinates_min = (0.0, -0.5, -0.5), + coordinates_max = (0.5, 0.5, 0.5))) +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 2, + refinement_patches = refinement_patches, + n_cells_max = 10_000) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) +semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) + +tspan = tspan_gpu = (0.0, 5.0) +t = t_gpu = 0.0 + +# Semi on CPU +(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi + +# Semi on GPU +equations_gpu = semi_gpu.equations +mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache +boundary_conditions_gpu = semi_gpu.boundary_conditions +source_terms_gpu = semi_gpu.source_terms + +# ODE on CPU +ode = semidiscretize(semi, tspan) +u_ode = copy(ode.u0) +du_ode = similar(u_ode) +u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) +du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) + +# ODE on GPU +ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu) +u_gpu = copy(ode_gpu.u0) +du_gpu = similar(u_gpu) + +# Tests for components initialization +@test_approx (u_gpu, u) +# du is initlaizaed as undefined, cannot test now + +# Tests for semidiscretization process +TrixiCUDA.reset_du!(du_gpu) +Trixi.reset_du!(du, solver, cache) +@test_approx (du_gpu, du) + +TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu.volume_integral, solver_gpu, + cache_gpu) +Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), + equations, solver.volume_integral, solver, cache) +@test_approx (du_gpu, du) + +TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) +Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) +@test_approx (cache_gpu.interfaces.u, cache.interfaces.u) + +TrixiCUDA.cuda_interface_flux!(mesh_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu, cache_gpu) +Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, + Trixi.have_nonconservative_terms(equations), equations, + solver.surface_integral, solver, cache) +@test_approx (cache_gpu.elements.surface_flux_values, + cache.elements.surface_flux_values) + +TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, + equations_gpu, cache_gpu) +Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) +@test_approx (cache_gpu.boundaries.u, cache.boundaries.u) + +TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu, cache_gpu) +Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, + solver.surface_integral, solver) +@test_approx (cache_gpu.elements.surface_flux_values, + cache.elements.surface_flux_values) + +TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, + TrixiCUDA.check_cache_mortars(cache_gpu), + solver_gpu, cache_gpu) From 0a454c9c6ef885eaf72afa9d3598920645909b3e Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Fri, 20 Dec 2024 22:18:47 -1000 Subject: [PATCH 2/3] Complete --- benchmark.jl | 38 +-- src/solvers/dg_2d.jl | 12 +- src/solvers/dg_3d.jl | 607 +++++++++++++++++++++++++++++++------------ test.jl | 48 ---- test2.jl | 97 ------- 5 files changed, 472 insertions(+), 330 deletions(-) delete mode 100644 test.jl delete mode 100644 test2.jl diff --git a/benchmark.jl b/benchmark.jl index b401eca3..36a1c228 100644 --- a/benchmark.jl +++ b/benchmark.jl @@ -2,24 +2,27 @@ using Trixi, TrixiCUDA using CUDA using BenchmarkTools -equations = CompressibleEulerEquations3D(1.4) +equations = IdealGlmMhdEquations3D(5 / 3) initial_condition = initial_condition_convergence_test -solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) -coordinates_min = (0.0, 0.0, 0.0) -coordinates_max = (2.0, 2.0, 2.0) -refinement_patches = ((type = "box", coordinates_min = (0.5, 0.5, 0.5), - coordinates_max = (1.5, 1.5, 1.5)),) +volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell) +solver = DGSEM(polydeg = 3, + surface_flux = (flux_hlle, + flux_nonconservative_powell), + volume_integral = VolumeIntegralFluxDifferencing(volume_flux)) + +coordinates_min = (-1.0, -1.0, -1.0) +coordinates_max = (1.0, 1.0, 1.0) +refinement_patches = ((type = "box", coordinates_min = (-0.5, -0.5, -0.5), + coordinates_max = (0.5, 0.5, 0.5)),) mesh = TreeMesh(coordinates_min, coordinates_max, initial_refinement_level = 2, refinement_patches = refinement_patches, n_cells_max = 10_000) -semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, - source_terms = source_terms_convergence_test) -semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver, - source_terms = source_terms_convergence_test) +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) +semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) tspan = tspan_gpu = (0.0, 1.0) t = t_gpu = 0.0 @@ -73,11 +76,10 @@ TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, equations_gpu, solver_gpu, cache_gpu) # Prolong to mortars test -@benchmark CUDA.@sync TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) - -# # Mortar flux test -# @benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), -# Trixi.have_nonconservative_terms(equations_gpu), -# equations_gpu, solver_gpu, cache_gpu) +TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, + TrixiCUDA.check_cache_mortars(cache_gpu), + solver_gpu, cache_gpu) + +@benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), + Trixi.have_nonconservative_terms(equations_gpu), + equations_gpu, solver_gpu, cache_gpu) diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index de0643eb..e7cae9f8 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -902,7 +902,7 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_value fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations::AbstractEquations{2}) + orientations) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z @@ -1619,12 +1619,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati reverse_lower, neighbor_ids, large_sides, - orientations, - equations) + orientations) mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations; + orientations; kernel_configurator_3d(mortar_flux_copy_to_kernel, size(surface_flux_values, 1), size(surface_flux_values, 2), @@ -1679,12 +1678,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati reverse_lower, neighbor_ids, large_sides, - orientations, - equations) + orientations) mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations; + orientations; kernel_configurator_3d(mortar_flux_copy_to_kernel, size(surface_flux_values, 1), size(surface_flux_values, 2), diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 54ed7be5..8bdf8468 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -1272,8 +1272,128 @@ function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right end # # Kernel for copying mortar fluxes small to small and small to large - step 1 -function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right, - tmp_lower_left, tmp_lower_right, +# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right, +# tmp_lower_left, tmp_lower_right, +# fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && +# k <= length(orientations)) +# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 +# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + +# lower_left_element = neighbor_ids[1, k] +# lower_right_element = neighbor_ids[2, k] +# upper_left_element = neighbor_ids[3, k] +# upper_right_element = neighbor_ids[4, k] +# large_element = neighbor_ids[5, k] + +# large_side = large_sides[k] +# orientation = orientations[k] + +# # Use simple math expression to enhance the performance (against control flow), +# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + +# # isequal(large_side, 1) * isequal(orientation, 2) * 3 + +# # isequal(large_side, 1) * isequal(orientation, 3) * 5 + +# # isequal(large_side, 2) * isequal(orientation, 1) * 2 + +# # isequal(large_side, 2) * isequal(orientation, 2) * 4 + +# # isequal(large_side, 2) * isequal(orientation, 3) * 6`. +# # Please also check the original code in Trixi.jl when you modify this code. +# direction = 2 * orientation + large_side - 2 + +# surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] + +# # Use simple math expression to enhance the performance (against control flow), +# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + +# # isequal(large_side, 1) * isequal(orientation, 2) * 4 + +# # isequal(large_side, 1) * isequal(orientation, 3) * 6 + +# # isequal(large_side, 2) * isequal(orientation, 1) * 1 + +# # isequal(large_side, 2) * isequal(orientation, 2) * 3 + +# # isequal(large_side, 2) * isequal(orientation, 3) * 5`. +# # Please also check the original code in Trixi.jl when you modify this code. +# direction = 2 * orientation - large_side + 1 + +# @inbounds begin +# for j1j1 in axes(reverse_upper, 2) +# tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * +# fstar_secondary_upper_left[i, j1j1, j2, k] +# tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * +# fstar_secondary_upper_right[i, j1j1, j2, k] +# tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * +# fstar_secondary_lower_left[i, j1j1, j2, k] +# tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * +# fstar_secondary_lower_right[i, j1j1, j2, k] +# end +# end +# end + +# return nothing +# end + +# # Kernel for copying mortar fluxes small to small and small to large - step 2 +# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations, equations::AbstractEquations{3}) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && +# k <= length(orientations)) +# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 +# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + +# large_element = neighbor_ids[5, k] + +# large_side = large_sides[k] +# orientation = orientations[k] + +# # See step 1 for the explanation of the following expression +# direction = 2 * orientation - large_side + 1 + +# @inbounds begin +# for j2j2 in axes(reverse_lower, 2) +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * +# tmp_upper_left[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * +# tmp_upper_right[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * +# tmp_lower_left[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * +# tmp_lower_right[i, j1, j2j2, +# direction, +# large_element] +# end + +# surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, +# direction, +# large_element] +# end +# end + +# return nothing +# end + +# Kernel for copying mortar fluxes small to small and small to large +function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, + tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, fstar_secondary_upper_right, @@ -1284,108 +1404,98 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_up j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && - k <= length(orientations)) - j1 = div(j - 1, size(surface_flux_values, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 - - lower_left_element = neighbor_ids[1, k] - lower_right_element = neighbor_ids[2, k] - upper_left_element = neighbor_ids[3, k] - upper_right_element = neighbor_ids[4, k] - large_element = neighbor_ids[5, k] - - large_side = large_sides[k] - orientation = orientations[k] - - # Use simple math expression to enhance the performance (against control flow), - # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + - # isequal(large_side, 1) * isequal(orientation, 2) * 3 + - # isequal(large_side, 1) * isequal(orientation, 3) * 5 + - # isequal(large_side, 2) * isequal(orientation, 1) * 2 + - # isequal(large_side, 2) * isequal(orientation, 2) * 4 + - # isequal(large_side, 2) * isequal(orientation, 3) * 6`. - # Please also check the original code in Trixi.jl when you modify this code. - direction = 2 * orientation + large_side - 2 - - surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] - - # Use simple math expression to enhance the performance (against control flow), - # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + - # isequal(large_side, 1) * isequal(orientation, 2) * 4 + - # isequal(large_side, 1) * isequal(orientation, 3) * 6 + - # isequal(large_side, 2) * isequal(orientation, 1) * 1 + - # isequal(large_side, 2) * isequal(orientation, 2) * 3 + - # isequal(large_side, 2) * isequal(orientation, 3) * 5`. - # Please also check the original code in Trixi.jl when you modify this code. - direction = 2 * orientation - large_side + 1 + # Loop stride for each dimension + stride_x = gridDim().x * blockDim().x + stride_y = gridDim().y * blockDim().y + stride_z = gridDim().z * blockDim().z - @inbounds begin - for j1j1 in axes(reverse_upper, 2) - tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * - fstar_secondary_upper_left[i, j1j1, j2, k] - tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * - fstar_secondary_upper_right[i, j1j1, j2, k] - tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * - fstar_secondary_lower_left[i, j1j1, j2, k] - tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * - fstar_secondary_lower_right[i, j1j1, j2, k] - end - end - end + # Cooperative kernel needs stride loops to handle the constrained launch size + while i <= size(surface_flux_values, 1) + while j <= size(surface_flux_values, 2)^2 + while k <= length(orientations) + j1 = div(j - 1, size(surface_flux_values, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + + lower_left_element = neighbor_ids[1, k] + lower_right_element = neighbor_ids[2, k] + upper_left_element = neighbor_ids[3, k] + upper_right_element = neighbor_ids[4, k] + large_element = neighbor_ids[5, k] - return nothing -end + large_side = large_sides[k] + orientation = orientations[k] -# Kernel for copying mortar fluxes small to small and small to large - step 2 -function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + + # isequal(large_side, 1) * isequal(orientation, 2) * 3 + + # isequal(large_side, 1) * isequal(orientation, 3) * 5 + + # isequal(large_side, 2) * isequal(orientation, 1) * 2 + + # isequal(large_side, 2) * isequal(orientation, 2) * 4 + + # isequal(large_side, 2) * isequal(orientation, 3) * 6`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation + large_side - 2 + + surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] + + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + + # isequal(large_side, 1) * isequal(orientation, 2) * 4 + + # isequal(large_side, 1) * isequal(orientation, 3) * 6 + + # isequal(large_side, 2) * isequal(orientation, 1) * 1 + + # isequal(large_side, 2) * isequal(orientation, 2) * 3 + + # isequal(large_side, 2) * isequal(orientation, 3) * 5`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation - large_side + 1 - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && - k <= length(orientations)) - j1 = div(j - 1, size(surface_flux_values, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + @inbounds begin + for j1j1 in axes(reverse_upper, 2) + tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_secondary_upper_left[i, j1j1, j2, k] + tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_secondary_upper_right[i, j1j1, j2, k] + tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_secondary_lower_left[i, j1j1, j2, k] + tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_secondary_lower_right[i, j1j1, j2, k] + end + end - large_element = neighbor_ids[5, k] + # Grid scope synchronization + grid = CG.this_grid() + CG.sync(grid) - large_side = large_sides[k] - orientation = orientations[k] - - # See step 1 for the explanation of the following expression - direction = 2 * orientation - large_side + 1 + @inbounds begin + for j2j2 in axes(reverse_lower, 2) + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * + tmp_upper_left[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * + tmp_upper_right[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * + tmp_lower_left[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * + tmp_lower_right[i, j1, j2j2, + direction, + large_element] + end - @inbounds begin - for j2j2 in axes(reverse_lower, 2) - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * - tmp_upper_left[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * - tmp_upper_right[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * - tmp_lower_left[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * - tmp_lower_right[i, j1, j2j2, - direction, - large_element] + surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, + direction, + large_element] + end + k += stride_z end - - surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, - direction, - large_element] + j += stride_y end + i += stride_x end return nothing @@ -2118,7 +2228,116 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::False, nonconservat @assert iszero(length(cache.mortars.orientations)) end -# Pack kernels for calculating mortar fluxes +# # Pack kernels for calculating mortar fluxes +# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False, +# equations, dg::DGSEM, cache) +# surface_flux = dg.surface_integral.surface_flux + +# neighbor_ids = cache.mortars.neighbor_ids +# large_sides = cache.mortars.large_sides +# orientations = cache.mortars.orientations + +# # The original CPU arrays hold NaNs +# u_upper_left = cache.mortars.u_upper_left +# u_upper_right = cache.mortars.u_upper_right +# u_lower_left = cache.mortars.u_lower_left +# u_lower_right = cache.mortars.u_lower_right +# reverse_upper = CuArray(dg.mortar.reverse_upper) +# reverse_lower = CuArray(dg.mortar.reverse_lower) + +# surface_flux_values = cache.elements.surface_flux_values +# tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero + +# fstar_primary_upper_left = cache.fstar_primary_upper_left +# fstar_primary_upper_right = cache.fstar_primary_upper_right +# fstar_primary_lower_left = cache.fstar_primary_lower_left +# fstar_primary_lower_right = cache.fstar_primary_lower_right +# fstar_secondary_upper_left = cache.fstar_secondary_upper_left +# fstar_secondary_upper_right = cache.fstar_secondary_upper_right +# fstar_secondary_lower_left = cache.fstar_secondary_lower_left +# fstar_secondary_lower_right = cache.fstar_secondary_lower_right + +# mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# u_upper_left, u_upper_right, +# u_lower_left, u_lower_right, +# orientations, equations, +# surface_flux) +# mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, +# equations, surface_flux; +# kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3), +# size(u_upper_left, 4), +# length(orientations))...) + +# tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations, +# equations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, +# reverse_lower, neighbor_ids, large_sides, orientations, equations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# return nothing +# end + +# Pack kernels for calculating mortar fluxes (optimized) function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False, equations, dg::DGSEM, cache) surface_flux = dg.surface_integral.surface_flux @@ -2174,8 +2393,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero - # TODO: Combine these two copy kernels into one (synchronization) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, + tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, @@ -2193,41 +2412,130 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati neighbor_ids, large_sides, orientations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_secondary_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) - - mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, - tmp_surface_flux_values, - tmp_upper_left, - tmp_upper_right, - tmp_lower_left, - tmp_lower_right, - reverse_upper, - reverse_lower, - neighbor_ids, - large_sides, - orientations, - equations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, - reverse_lower, neighbor_ids, large_sides, orientations, equations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) + mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, + tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, + fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right, + reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; + cooperative = true, + kernel_configurator_coop_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) return nothing end +# # Pack kernels for calculating mortar fluxes +# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True, +# equations, dg::DGSEM, cache) +# surface_flux, nonconservative_flux = dg.surface_integral.surface_flux + +# neighbor_ids = cache.mortars.neighbor_ids +# large_sides = cache.mortars.large_sides +# orientations = cache.mortars.orientations + +# # The original CPU arrays hold NaNs +# u_upper_left = cache.mortars.u_upper_left +# u_upper_right = cache.mortars.u_upper_right +# u_lower_left = cache.mortars.u_lower_left +# u_lower_right = cache.mortars.u_lower_right +# reverse_upper = CuArray(dg.mortar.reverse_upper) +# reverse_lower = CuArray(dg.mortar.reverse_lower) + +# surface_flux_values = cache.elements.surface_flux_values +# tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero + +# fstar_primary_upper_left = cache.fstar_primary_upper_left +# fstar_primary_upper_right = cache.fstar_primary_upper_right +# fstar_primary_lower_left = cache.fstar_primary_lower_left +# fstar_primary_lower_right = cache.fstar_primary_lower_right +# fstar_secondary_upper_left = cache.fstar_secondary_upper_left +# fstar_secondary_upper_right = cache.fstar_secondary_upper_right +# fstar_secondary_lower_left = cache.fstar_secondary_lower_left +# fstar_secondary_lower_right = cache.fstar_secondary_lower_right + +# mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# u_upper_left, u_upper_right, +# u_lower_left, u_lower_right, +# orientations, large_sides, +# equations, surface_flux, +# nonconservative_flux) +# mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, +# large_sides, equations, surface_flux, nonconservative_flux; +# kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3), +# size(u_upper_left, 4), +# length(orientations))...) + +# tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations, +# equations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, +# reverse_lower, neighbor_ids, large_sides, orientations, equations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# return nothing +# end + # Pack kernels for calculating mortar fluxes function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True, equations, dg::DGSEM, cache) @@ -2285,8 +2593,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero - # TODO: Combine these two copy kernels into one (synchronization) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, + tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, @@ -2304,37 +2612,16 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati neighbor_ids, large_sides, orientations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_secondary_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) - - mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, - tmp_surface_flux_values, - tmp_upper_left, - tmp_upper_right, - tmp_lower_left, - tmp_lower_right, - reverse_upper, - reverse_lower, - neighbor_ids, - large_sides, - orientations, - equations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, - reverse_lower, neighbor_ids, large_sides, orientations, equations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) + mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, + tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, + fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right, + reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; + cooperative = true, + kernel_configurator_coop_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) return nothing end diff --git a/test.jl b/test.jl deleted file mode 100644 index 95ad959f..00000000 --- a/test.jl +++ /dev/null @@ -1,48 +0,0 @@ -# using CUDA - -# # Kernel using CG.this_grid() for grid-level synchronization -# function simple_kernel(data) -# idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x - -# # Simple operation: increment each element -# if idx <= length(data) -# data[idx] += 1.0f0 -# end - -# grid = CG.this_grid() -# CG.sync(grid) - -# return nothing -# end - -# # Host code -# function main() -# n = 1024 -# data = CUDA.fill(Float32(1.0), n) # Initialize array with 1.0s - -# # Kernel launch configuration -# threads_per_block = 10 -# num_blocks = ceil(Int, n / threads_per_block) - -# println("num_blocks: ", num_blocks) - -# # Launch kernel with cooperative launch enabled -# kernel = @cuda launch=false simple_kernel(data) - -# kernel(data; blocks=num_blocks, threads=threads_per_block, cooperative=true) - -# # Fetch results back to host -# result = Array(data) -# end - -# main() - -using CUDA - -function query_max_threads_per_block() - device = CUDA.device() # Get the current CUDA device - max_threads = CUDA.capability(device).max_threads_per_block # Query max threads per block - println("Maximum threads per block: $max_threads") -end - -query_max_threads_per_block() diff --git a/test2.jl b/test2.jl deleted file mode 100644 index ab044f24..00000000 --- a/test2.jl +++ /dev/null @@ -1,97 +0,0 @@ -using Trixi, TrixiCUDA -using CUDA -using Test -using BenchmarkTools - -include("test/test_macros.jl") - -advection_velocity = (0.2, -0.7, 0.5) -equations = LinearScalarAdvectionEquation3D(advection_velocity) - -initial_condition = initial_condition_convergence_test -solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) - -coordinates_min = (-1.0, -1.0, -1.0) -coordinates_max = (1.0, 1.0, 1.0) -refinement_patches = ((type = "box", coordinates_min = (0.0, -1.0, -1.0), - coordinates_max = (1.0, 1.0, 1.0)), - (type = "box", coordinates_min = (0.0, -0.5, -0.5), - coordinates_max = (0.5, 0.5, 0.5))) -mesh = TreeMesh(coordinates_min, coordinates_max, - initial_refinement_level = 2, - refinement_patches = refinement_patches, - n_cells_max = 10_000) - -semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) -semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) - -tspan = tspan_gpu = (0.0, 5.0) -t = t_gpu = 0.0 - -# Semi on CPU -(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi - -# Semi on GPU -equations_gpu = semi_gpu.equations -mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache -boundary_conditions_gpu = semi_gpu.boundary_conditions -source_terms_gpu = semi_gpu.source_terms - -# ODE on CPU -ode = semidiscretize(semi, tspan) -u_ode = copy(ode.u0) -du_ode = similar(u_ode) -u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) -du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) - -# ODE on GPU -ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu) -u_gpu = copy(ode_gpu.u0) -du_gpu = similar(u_gpu) - -# Tests for components initialization -@test_approx (u_gpu, u) -# du is initlaizaed as undefined, cannot test now - -# Tests for semidiscretization process -TrixiCUDA.reset_du!(du_gpu) -Trixi.reset_du!(du, solver, cache) -@test_approx (du_gpu, du) - -TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu.volume_integral, solver_gpu, - cache_gpu) -Trixi.calc_volume_integral!(du, u, mesh, Trixi.have_nonconservative_terms(equations), - equations, solver.volume_integral, solver, cache) -@test_approx (du_gpu, du) - -TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) -Trixi.prolong2interfaces!(cache, u, mesh, equations, solver.surface_integral, solver) -@test_approx (cache_gpu.interfaces.u, cache.interfaces.u) - -TrixiCUDA.cuda_interface_flux!(mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -Trixi.calc_interface_flux!(cache.elements.surface_flux_values, mesh, - Trixi.have_nonconservative_terms(equations), equations, - solver.surface_integral, solver, cache) -@test_approx (cache_gpu.elements.surface_flux_values, - cache.elements.surface_flux_values) - -TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, - equations_gpu, cache_gpu) -Trixi.prolong2boundaries!(cache, u, mesh, equations, solver.surface_integral, solver) -@test_approx (cache_gpu.boundaries.u, cache.boundaries.u) - -TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) -Trixi.calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - solver.surface_integral, solver) -@test_approx (cache_gpu.elements.surface_flux_values, - cache.elements.surface_flux_values) - -TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) From a01f7a3923854fcc3358cc54001d3466299e00ae Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Fri, 20 Dec 2024 22:29:34 -1000 Subject: [PATCH 3/3] Remove benchmark --- benchmark.jl | 85 ---------------------------------------------------- 1 file changed, 85 deletions(-) delete mode 100644 benchmark.jl diff --git a/benchmark.jl b/benchmark.jl deleted file mode 100644 index 36a1c228..00000000 --- a/benchmark.jl +++ /dev/null @@ -1,85 +0,0 @@ -using Trixi, TrixiCUDA -using CUDA -using BenchmarkTools - -equations = IdealGlmMhdEquations3D(5 / 3) - -initial_condition = initial_condition_convergence_test - -volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell) -solver = DGSEM(polydeg = 3, - surface_flux = (flux_hlle, - flux_nonconservative_powell), - volume_integral = VolumeIntegralFluxDifferencing(volume_flux)) - -coordinates_min = (-1.0, -1.0, -1.0) -coordinates_max = (1.0, 1.0, 1.0) -refinement_patches = ((type = "box", coordinates_min = (-0.5, -0.5, -0.5), - coordinates_max = (0.5, 0.5, 0.5)),) -mesh = TreeMesh(coordinates_min, coordinates_max, - initial_refinement_level = 2, - refinement_patches = refinement_patches, - n_cells_max = 10_000) - -semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) -semi_gpu = SemidiscretizationHyperbolicGPU(mesh, equations, initial_condition, solver) - -tspan = tspan_gpu = (0.0, 1.0) -t = t_gpu = 0.0 - -# Semi on CPU -(; mesh, equations, boundary_conditions, source_terms, solver, cache) = semi - -# Semi on GPU -equations_gpu = semi_gpu.equations -mesh_gpu, solver_gpu, cache_gpu = semi_gpu.mesh, semi_gpu.solver, semi_gpu.cache -boundary_conditions_gpu = semi_gpu.boundary_conditions -source_terms_gpu = semi_gpu.source_terms - -# ODE on CPU -ode = semidiscretize(semi, tspan) -u_ode = copy(ode.u0) -du_ode = similar(u_ode) -u = Trixi.wrap_array(u_ode, mesh, equations, solver, cache) -du = Trixi.wrap_array(du_ode, mesh, equations, solver, cache) - -# ODE on GPU -ode_gpu = semidiscretizeGPU(semi_gpu, tspan_gpu) -u_gpu = copy(ode_gpu.u0) -du_gpu = similar(u_gpu) - -# Semidiscretization process -# Reset du test -TrixiCUDA.reset_du!(du_gpu) - -# Volume integral test -TrixiCUDA.cuda_volume_integral!(du_gpu, u_gpu, mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu.volume_integral, solver_gpu, - cache_gpu) - -# Prolong to interfaces test -TrixiCUDA.cuda_prolong2interfaces!(u_gpu, mesh_gpu, equations_gpu, cache_gpu) - -# Interface flux test -TrixiCUDA.cuda_interface_flux!(mesh_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) - -# Prolong to boundaries test -TrixiCUDA.cuda_prolong2boundaries!(u_gpu, mesh_gpu, boundary_conditions_gpu, - equations_gpu, cache_gpu) - -# Boundary flux test -TrixiCUDA.cuda_boundary_flux!(t_gpu, mesh_gpu, boundary_conditions_gpu, - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu) - -# Prolong to mortars test -TrixiCUDA.cuda_prolong2mortars!(u_gpu, mesh_gpu, - TrixiCUDA.check_cache_mortars(cache_gpu), - solver_gpu, cache_gpu) - -@benchmark CUDA.@sync TrixiCUDA.cuda_mortar_flux!(mesh_gpu, TrixiCUDA.check_cache_mortars(cache_gpu), - Trixi.have_nonconservative_terms(equations_gpu), - equations_gpu, solver_gpu, cache_gpu)