From ccb6d678b3896f1a464e781642186ab1d0818f80 Mon Sep 17 00:00:00 2001 From: Huiyu Xie Date: Fri, 20 Dec 2024 22:35:27 -1000 Subject: [PATCH] Combine similar kernels using cooperative groups (#97) * Cooperative kernels * Complete * Remove benchmark --- src/auxiliary/configurators.jl | 81 ++- src/solvers/dg_2d.jl | 12 +- src/solvers/dg_3d.jl | 1081 +++++++++++++++++++++++--------- 3 files changed, 857 insertions(+), 317 deletions(-) diff --git a/src/auxiliary/configurators.jl b/src/auxiliary/configurators.jl index c2e04a7..813576a 100644 --- a/src/auxiliary/configurators.jl +++ b/src/auxiliary/configurators.jl @@ -2,7 +2,7 @@ # blocks to be used in the kernel, which optimizes the use of GPU resources. # 1D kernel configurator -# We hardcode 32 threads per block for 1D kernels +# We hardcode 32 threads per block for 1D kernels. function kernel_configurator_1d(kernel::HostKernel, x::Int) # config = launch_configuration(kernel.fun) # not used in this case @@ -12,9 +12,25 @@ function kernel_configurator_1d(kernel::HostKernel, x::Int) return (threads = threads, blocks = blocks) end +# 1D kernel configurator for cooperative launch +# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, +# so we need to query the SM count first. Also, kernels launched with cooperative launch have +# to use stride loops to handle the constrained launch size. +function kernel_configurator_coop_1d(kernel::HostKernel, x::Int) + # config = launch_configuration(kernel.fun) # not used in this case + # Maybe pack properties into a struct + device = CUDA.device() + sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs + + threads = 32 # warp size is 32, if block size is less than 32, it will be padded to 32 + blocks = min(cld(x, threads), sm_count) + + return (threads = threads, blocks = blocks) +end + # 2D kernel configurator # We hardcode 32 threads for x dimension per block, and y dimension is determined -# by the number of threads returned by the launch configuration +# by the number of threads returned by the launch configuration. function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int) config = launch_configuration(kernel.fun) # get the number of threads @@ -31,9 +47,35 @@ function kernel_configurator_2d(kernel::HostKernel, x::Int, y::Int) return (threads = threads, blocks = blocks) end +# 2D kernel configurator for cooperative launch +# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, +# so we need to query the SM count first. Also, kernels launched with cooperative launch have +# to use stride loops to handle the constrained launch size. +function kernel_configurator_coop_2d(kernel::HostKernel, x::Int, y::Int) + config = launch_configuration(kernel.fun) # get the number of threads + # Maybe pack properties into a struct + device = CUDA.device() + sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs + + # y dimension + dims_y1 = cld(x * y, 32) + dims_y2 = max(fld(config.threads, 32), 1) + + dims_y = min(dims_y1, dims_y2) + + # x dimension is hardcoded to warp size 32 + threads = (32, dims_y) + blocks_x = cld(x, threads[1]) + blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x)) + + blocks = (blocks_x, blocks_y) + + return (threads = threads, blocks = blocks) +end + # 3D kernel configurator # We hardcode 32 threads for x dimension per block, y and z dimensions are determined -# by the number of threads returned by the launch configuration +# by the number of threads returned by the launch configuration. function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int) config = launch_configuration(kernel.fun) # get the number of threads @@ -56,6 +98,39 @@ function kernel_configurator_3d(kernel::HostKernel, x::Int, y::Int, z::Int) return (threads = threads, blocks = blocks) end +# 3D kernel configurator for cooperative launch +# Note that cooperative kernels can only launch as many blocks as there are SMs on the device, +# so we need to query the SM count first. Also, kernels launched with cooperative launch have +# to use stride loops to handle the constrained launch size. +function kernel_configurator_coop_3d(kernel::HostKernel, x::Int, y::Int, z::Int) + config = launch_configuration(kernel.fun) # get the number of threads + # Maybe pack properties into a struct + device = CUDA.device() + sm_count = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) # get number of SMs + + # y dimension + dims_y1 = cld(x * y, 32) + dims_y2 = max(fld(config.threads, 32), 1) + + dims_y = min(dims_y1, dims_y2) + + # z dimension + dims_z1 = cld(x * y * z, 32 * dims_y) + dims_z2 = max(fld(config.threads, 32 * dims_y), 1) + + dims_z = min(dims_z1, dims_z2) + + # x dimension is hardcoded to warp size 32 + threads = (32, dims_y, dims_z) + blocks_x = cld(x, threads[1]) + blocks_y = min(cld(y, threads[2]), fld(sm_count, blocks_x)) + blocks_z = min(cld(z, threads[3]), fld(sm_count, blocks_x * blocks_y)) + + blocks = (blocks_x, blocks_y, blocks_z) + + return (threads = threads, blocks = blocks) +end + # Deprecated old kernel configurators below # function configurator_1d(kernel::HostKernel, array::CuArray{<:Any, 1}) diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index de0643e..e7cae9f 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -902,7 +902,7 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_value fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations::AbstractEquations{2}) + orientations) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z @@ -1619,12 +1619,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati reverse_lower, neighbor_ids, large_sides, - orientations, - equations) + orientations) mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations; + orientations; kernel_configurator_3d(mortar_flux_copy_to_kernel, size(surface_flux_values, 1), size(surface_flux_values, 2), @@ -1679,12 +1678,11 @@ function cuda_mortar_flux!(mesh::TreeMesh{2}, cache_mortars::True, nonconservati reverse_lower, neighbor_ids, large_sides, - orientations, - equations) + orientations) mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, fstar_secondary_lower, reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations; + orientations; kernel_configurator_3d(mortar_flux_copy_to_kernel, size(surface_flux_values, 1), size(surface_flux_values, 2), diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index e14df20..8bdf846 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -890,126 +890,248 @@ function prolong_mortars_small2small_kernel!(u_upper_left, u_upper_right, u_lowe return nothing end -# Kernel for interpolating data large to small on mortars - step 1 -function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, u, forward_upper, - forward_lower, neighbor_ids, large_sides, orientations) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 && - k <= size(tmp_upper_left, 5)) - u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - large_side = large_sides[k] - orientation = orientations[k] - large_element = neighbor_ids[5, k] - - leftright = large_side - - @inbounds begin - for j1j1 in axes(forward_lower, 2) - tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - end - - for j1j1 in axes(forward_lower, 2) - tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - large_element] * (large_side - 1) - - tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - large_element] * (large_side - 1) - - tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - large_element] * (large_side - 1) - - tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - large_element] * (large_side - 1) - end - end - end - - return nothing -end - -# Kernel for interpolating data large to small on mortars - step 2 -function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, - u_lower_right, tmp_upper_left, tmp_upper_right, - tmp_lower_left, tmp_lower_right, forward_upper, - forward_lower, large_sides) +# # Kernel for interpolating data large to small on mortars - step 1 +# function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, u, forward_upper, +# forward_lower, neighbor_ids, large_sides, orientations) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 && +# k <= size(tmp_upper_left, 5)) +# u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) + +# j1 = div(j - 1, u2) + 1 +# j2 = rem(j - 1, u2) + 1 + +# large_side = large_sides[k] +# orientation = orientations[k] +# large_element = neighbor_ids[5, k] + +# leftright = large_side + +# @inbounds begin +# for j1j1 in axes(forward_lower, 2) +# tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) +# end + +# for j1j1 in axes(forward_lower, 2) +# tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) +# end +# end +# end + +# return nothing +# end + +# # Kernel for interpolating data large to small on mortars - step 2 +# function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, +# u_lower_right, tmp_upper_left, tmp_upper_right, +# tmp_lower_left, tmp_lower_right, forward_upper, +# forward_lower, large_sides) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 && +# k <= size(u_upper_left, 5)) +# u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2) + +# j1 = div(j - 1, u2) + 1 +# j2 = rem(j - 1, u2) + 1 + +# leftright = large_sides[k] + +# @inbounds begin +# for j2j2 in axes(forward_upper, 2) +# u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * +# tmp_upper_left[leftright, i, j1, j2j2, k] + +# u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * +# tmp_upper_right[leftright, i, j1, j2j2, k] + +# u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * +# tmp_lower_left[leftright, i, j1, j2j2, k] + +# u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * +# tmp_lower_right[leftright, i, j1, j2j2, k] +# end +# end +# end + +# return nothing +# end + +# Kernel for interpolating data large to small on mortars +function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, u_lower_right, + tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, + u, forward_upper, forward_lower, neighbor_ids, large_sides, + orientations) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 && - k <= size(u_upper_left, 5)) - u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - leftright = large_sides[k] - - @inbounds begin - for j2j2 in axes(forward_upper, 2) - u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * - tmp_upper_left[leftright, i, j1, j2j2, k] - - u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * - tmp_upper_right[leftright, i, j1, j2j2, k] - - u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * - tmp_lower_left[leftright, i, j1, j2j2, k] - - u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * - tmp_lower_right[leftright, i, j1, j2j2, k] + # Loop stride for each dimension + stride_x = gridDim().x * blockDim().x + stride_y = gridDim().y * blockDim().y + stride_z = gridDim().z * blockDim().z + + # Cooperative kernel needs stride loops to handle the constrained launch size + while i <= size(tmp_upper_left, 2) + while j <= size(tmp_upper_left, 3)^2 + while k <= size(tmp_upper_left, 5) + u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) + + j1 = div(j - 1, u2) + 1 + j2 = rem(j - 1, u2) + 1 + + large_side = large_sides[k] + orientation = orientations[k] + large_element = neighbor_ids[5, k] + + leftright = large_side + + @inbounds begin + for j1j1 in axes(forward_lower, 2) + tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + end + + for j1j1 in axes(forward_lower, 2) + tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + end + + # Grid scope synchronization + grid = CG.this_grid() + CG.sync(grid) + + for j2j2 in axes(forward_upper, 2) + u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * + tmp_upper_left[leftright, i, j1, j2j2, k] + + u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * + tmp_upper_right[leftright, i, j1, j2j2, k] + + u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * + tmp_lower_left[leftright, i, j1, j2j2, k] + + u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * + tmp_lower_right[leftright, i, j1, j2j2, k] + end + end + k += stride_z end + j += stride_y end + i += stride_x end return nothing @@ -1150,8 +1272,128 @@ function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right end # # Kernel for copying mortar fluxes small to small and small to large - step 1 -function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right, - tmp_lower_left, tmp_lower_right, +# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right, +# tmp_lower_left, tmp_lower_right, +# fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && +# k <= length(orientations)) +# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 +# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + +# lower_left_element = neighbor_ids[1, k] +# lower_right_element = neighbor_ids[2, k] +# upper_left_element = neighbor_ids[3, k] +# upper_right_element = neighbor_ids[4, k] +# large_element = neighbor_ids[5, k] + +# large_side = large_sides[k] +# orientation = orientations[k] + +# # Use simple math expression to enhance the performance (against control flow), +# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + +# # isequal(large_side, 1) * isequal(orientation, 2) * 3 + +# # isequal(large_side, 1) * isequal(orientation, 3) * 5 + +# # isequal(large_side, 2) * isequal(orientation, 1) * 2 + +# # isequal(large_side, 2) * isequal(orientation, 2) * 4 + +# # isequal(large_side, 2) * isequal(orientation, 3) * 6`. +# # Please also check the original code in Trixi.jl when you modify this code. +# direction = 2 * orientation + large_side - 2 + +# surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] + +# # Use simple math expression to enhance the performance (against control flow), +# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + +# # isequal(large_side, 1) * isequal(orientation, 2) * 4 + +# # isequal(large_side, 1) * isequal(orientation, 3) * 6 + +# # isequal(large_side, 2) * isequal(orientation, 1) * 1 + +# # isequal(large_side, 2) * isequal(orientation, 2) * 3 + +# # isequal(large_side, 2) * isequal(orientation, 3) * 5`. +# # Please also check the original code in Trixi.jl when you modify this code. +# direction = 2 * orientation - large_side + 1 + +# @inbounds begin +# for j1j1 in axes(reverse_upper, 2) +# tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * +# fstar_secondary_upper_left[i, j1j1, j2, k] +# tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * +# fstar_secondary_upper_right[i, j1j1, j2, k] +# tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * +# fstar_secondary_lower_left[i, j1j1, j2, k] +# tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * +# fstar_secondary_lower_right[i, j1j1, j2, k] +# end +# end +# end + +# return nothing +# end + +# # Kernel for copying mortar fluxes small to small and small to large - step 2 +# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations, equations::AbstractEquations{3}) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && +# k <= length(orientations)) +# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 +# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + +# large_element = neighbor_ids[5, k] + +# large_side = large_sides[k] +# orientation = orientations[k] + +# # See step 1 for the explanation of the following expression +# direction = 2 * orientation - large_side + 1 + +# @inbounds begin +# for j2j2 in axes(reverse_lower, 2) +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * +# tmp_upper_left[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * +# tmp_upper_right[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * +# tmp_lower_left[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * +# tmp_lower_right[i, j1, j2j2, +# direction, +# large_element] +# end + +# surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, +# direction, +# large_element] +# end +# end + +# return nothing +# end + +# Kernel for copying mortar fluxes small to small and small to large +function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, + tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, fstar_secondary_upper_right, @@ -1162,108 +1404,98 @@ function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_up j = (blockIdx().y - 1) * blockDim().y + threadIdx().y k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && - k <= length(orientations)) - j1 = div(j - 1, size(surface_flux_values, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 - - lower_left_element = neighbor_ids[1, k] - lower_right_element = neighbor_ids[2, k] - upper_left_element = neighbor_ids[3, k] - upper_right_element = neighbor_ids[4, k] - large_element = neighbor_ids[5, k] - - large_side = large_sides[k] - orientation = orientations[k] - - # Use simple math expression to enhance the performance (against control flow), - # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + - # isequal(large_side, 1) * isequal(orientation, 2) * 3 + - # isequal(large_side, 1) * isequal(orientation, 3) * 5 + - # isequal(large_side, 2) * isequal(orientation, 1) * 2 + - # isequal(large_side, 2) * isequal(orientation, 2) * 4 + - # isequal(large_side, 2) * isequal(orientation, 3) * 6`. - # Please also check the original code in Trixi.jl when you modify this code. - direction = 2 * orientation + large_side - 2 - - surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] - - # Use simple math expression to enhance the performance (against control flow), - # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + - # isequal(large_side, 1) * isequal(orientation, 2) * 4 + - # isequal(large_side, 1) * isequal(orientation, 3) * 6 + - # isequal(large_side, 2) * isequal(orientation, 1) * 1 + - # isequal(large_side, 2) * isequal(orientation, 2) * 3 + - # isequal(large_side, 2) * isequal(orientation, 3) * 5`. - # Please also check the original code in Trixi.jl when you modify this code. - direction = 2 * orientation - large_side + 1 - - @inbounds begin - for j1j1 in axes(reverse_upper, 2) - tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * - fstar_secondary_upper_left[i, j1j1, j2, k] - tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * - fstar_secondary_upper_right[i, j1j1, j2, k] - tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * - fstar_secondary_lower_left[i, j1j1, j2, k] - tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * - fstar_secondary_lower_right[i, j1j1, j2, k] - end - end - end - - return nothing -end - -# Kernel for copying mortar fluxes small to small and small to large - step 2 -function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations, equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && - k <= length(orientations)) - j1 = div(j - 1, size(surface_flux_values, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 - - large_element = neighbor_ids[5, k] - - large_side = large_sides[k] - orientation = orientations[k] - - # See step 1 for the explanation of the following expression - direction = 2 * orientation - large_side + 1 + # Loop stride for each dimension + stride_x = gridDim().x * blockDim().x + stride_y = gridDim().y * blockDim().y + stride_z = gridDim().z * blockDim().z + + # Cooperative kernel needs stride loops to handle the constrained launch size + while i <= size(surface_flux_values, 1) + while j <= size(surface_flux_values, 2)^2 + while k <= length(orientations) + j1 = div(j - 1, size(surface_flux_values, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + + lower_left_element = neighbor_ids[1, k] + lower_right_element = neighbor_ids[2, k] + upper_left_element = neighbor_ids[3, k] + upper_right_element = neighbor_ids[4, k] + large_element = neighbor_ids[5, k] + + large_side = large_sides[k] + orientation = orientations[k] + + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + + # isequal(large_side, 1) * isequal(orientation, 2) * 3 + + # isequal(large_side, 1) * isequal(orientation, 3) * 5 + + # isequal(large_side, 2) * isequal(orientation, 1) * 2 + + # isequal(large_side, 2) * isequal(orientation, 2) * 4 + + # isequal(large_side, 2) * isequal(orientation, 3) * 6`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation + large_side - 2 + + surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] + + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + + # isequal(large_side, 1) * isequal(orientation, 2) * 4 + + # isequal(large_side, 1) * isequal(orientation, 3) * 6 + + # isequal(large_side, 2) * isequal(orientation, 1) * 1 + + # isequal(large_side, 2) * isequal(orientation, 2) * 3 + + # isequal(large_side, 2) * isequal(orientation, 3) * 5`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation - large_side + 1 + + @inbounds begin + for j1j1 in axes(reverse_upper, 2) + tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_secondary_upper_left[i, j1j1, j2, k] + tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_secondary_upper_right[i, j1j1, j2, k] + tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_secondary_lower_left[i, j1j1, j2, k] + tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_secondary_lower_right[i, j1j1, j2, k] + end + end - @inbounds begin - for j2j2 in axes(reverse_lower, 2) - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * - tmp_upper_left[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * - tmp_upper_right[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * - tmp_lower_left[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * - tmp_lower_right[i, j1, j2j2, - direction, - large_element] + # Grid scope synchronization + grid = CG.this_grid() + CG.sync(grid) + + @inbounds begin + for j2j2 in axes(reverse_lower, 2) + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * + tmp_upper_left[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * + tmp_upper_right[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * + tmp_lower_left[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * + tmp_lower_right[i, j1, j2j2, + direction, + large_element] + end + + surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, + direction, + large_element] + end + k += stride_z end - - surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, - direction, - large_element] + j += stride_y end + i += stride_x end return nothing @@ -1857,7 +2089,81 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::False, dg::D @assert iszero(length(cache.mortars.orientations)) end -# Pack kernels for prolonging solution to mortars +# # Pack kernels for prolonging solution to mortars +# function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache) +# neighbor_ids = cache.mortars.neighbor_ids +# large_sides = cache.mortars.large_sides +# orientations = cache.mortars.orientations + +# # The original CPU arrays hold NaNs +# u_upper_left = cache.mortars.u_upper_left +# u_upper_right = cache.mortars.u_upper_right +# u_lower_left = cache.mortars.u_lower_left +# u_lower_right = cache.mortars.u_lower_right +# forward_upper = CuArray(dg.mortar.forward_upper) +# forward_lower = CuArray(dg.mortar.forward_lower) + +# prolong_mortars_small2small_kernel = @cuda launch=false prolong_mortars_small2small_kernel!(u_upper_left, +# u_upper_right, +# u_lower_left, +# u_lower_right, +# u, +# neighbor_ids, +# large_sides, +# orientations) +# prolong_mortars_small2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, u, +# neighbor_ids, large_sides, orientations; +# kernel_configurator_3d(prolong_mortars_small2small_kernel, +# size(u_upper_left, 2), +# size(u_upper_left, 3)^2, +# size(u_upper_left, 5))...) + +# tmp_upper_left = zero(similar(u_upper_left)) # undef to zero +# tmp_upper_right = zero(similar(u_upper_right)) # undef to zero +# tmp_lower_left = zero(similar(u_lower_left)) # undef to zero +# tmp_lower_right = zero(similar(u_lower_right)) # undef to zero + +# prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# u, +# forward_upper, +# forward_lower, +# neighbor_ids, +# large_sides, +# orientations) +# prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, u, forward_upper, forward_lower, +# neighbor_ids, large_sides, orientations; +# kernel_configurator_3d(prolong_mortars_large2small_kernel, +# size(u_upper_left, 2), +# size(u_upper_left, 3)^2, +# size(u_upper_left, 5))...) + +# prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left, +# u_upper_right, +# u_lower_left, +# u_lower_right, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# forward_upper, +# forward_lower, +# large_sides) +# prolong_mortars_large2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, +# tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, forward_upper, forward_lower, large_sides; +# kernel_configurator_3d(prolong_mortars_large2small_kernel, +# size(u_upper_left, 2), +# size(u_upper_left, 3)^2, +# size(u_upper_left, 5))...) + +# return nothing +# end + +# Pack kernels for prolonging solution to mortars (optimized) function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DGSEM, cache) neighbor_ids = cache.mortars.neighbor_ids large_sides = cache.mortars.large_sides @@ -1891,25 +2197,6 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG tmp_lower_left = zero(similar(u_lower_left)) # undef to zero tmp_lower_right = zero(similar(u_lower_right)) # undef to zero - # TODO: Combine these two kernels into one (synchronization) - prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(tmp_upper_left, - tmp_upper_right, - tmp_lower_left, - tmp_lower_right, - u, - forward_upper, - forward_lower, - neighbor_ids, - large_sides, - orientations) - prolong_mortars_large2small_kernel(tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, u, forward_upper, forward_lower, - neighbor_ids, large_sides, orientations; - kernel_configurator_3d(prolong_mortars_large2small_kernel, - size(u_upper_left, 2), - size(u_upper_left, 3)^2, - size(u_upper_left, 5))...) - prolong_mortars_large2small_kernel = @cuda launch=false prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, @@ -1918,16 +2205,19 @@ function cuda_prolong2mortars!(u, mesh::TreeMesh{3}, cache_mortars::True, dg::DG tmp_upper_right, tmp_lower_left, tmp_lower_right, - forward_upper, + u, forward_upper, forward_lower, - large_sides) + neighbor_ids, + large_sides, + orientations) prolong_mortars_large2small_kernel(u_upper_left, u_upper_right, u_lower_left, u_lower_right, tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, forward_upper, forward_lower, large_sides; - kernel_configurator_3d(prolong_mortars_large2small_kernel, - size(u_upper_left, 2), - size(u_upper_left, 3)^2, - size(u_upper_left, 5))...) + tmp_lower_right, u, forward_upper, forward_lower, neighbor_ids, + large_sides, orientations; cooperative = true, + kernel_configurator_coop_3d(prolong_mortars_large2small_kernel, + size(u_upper_left, 2), + size(u_upper_left, 3)^2, + size(u_upper_left, 5))...) return nothing end @@ -1938,7 +2228,116 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::False, nonconservat @assert iszero(length(cache.mortars.orientations)) end -# Pack kernels for calculating mortar fluxes +# # Pack kernels for calculating mortar fluxes +# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False, +# equations, dg::DGSEM, cache) +# surface_flux = dg.surface_integral.surface_flux + +# neighbor_ids = cache.mortars.neighbor_ids +# large_sides = cache.mortars.large_sides +# orientations = cache.mortars.orientations + +# # The original CPU arrays hold NaNs +# u_upper_left = cache.mortars.u_upper_left +# u_upper_right = cache.mortars.u_upper_right +# u_lower_left = cache.mortars.u_lower_left +# u_lower_right = cache.mortars.u_lower_right +# reverse_upper = CuArray(dg.mortar.reverse_upper) +# reverse_lower = CuArray(dg.mortar.reverse_lower) + +# surface_flux_values = cache.elements.surface_flux_values +# tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero + +# fstar_primary_upper_left = cache.fstar_primary_upper_left +# fstar_primary_upper_right = cache.fstar_primary_upper_right +# fstar_primary_lower_left = cache.fstar_primary_lower_left +# fstar_primary_lower_right = cache.fstar_primary_lower_right +# fstar_secondary_upper_left = cache.fstar_secondary_upper_left +# fstar_secondary_upper_right = cache.fstar_secondary_upper_right +# fstar_secondary_lower_left = cache.fstar_secondary_lower_left +# fstar_secondary_lower_right = cache.fstar_secondary_lower_right + +# mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# u_upper_left, u_upper_right, +# u_lower_left, u_lower_right, +# orientations, equations, +# surface_flux) +# mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, +# equations, surface_flux; +# kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3), +# size(u_upper_left, 4), +# length(orientations))...) + +# tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations, +# equations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, +# reverse_lower, neighbor_ids, large_sides, orientations, equations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# return nothing +# end + +# Pack kernels for calculating mortar fluxes (optimized) function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::False, equations, dg::DGSEM, cache) surface_flux = dg.surface_integral.surface_flux @@ -1994,8 +2393,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero - # TODO: Combine these two copy kernels into one (synchronization) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, + tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, @@ -2013,41 +2412,130 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati neighbor_ids, large_sides, orientations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_secondary_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) - - mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, - tmp_surface_flux_values, - tmp_upper_left, - tmp_upper_right, - tmp_lower_left, - tmp_lower_right, - reverse_upper, - reverse_lower, - neighbor_ids, - large_sides, - orientations, - equations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, - reverse_lower, neighbor_ids, large_sides, orientations, equations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) + mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, + tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, + fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right, + reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; + cooperative = true, + kernel_configurator_coop_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) return nothing end +# # Pack kernels for calculating mortar fluxes +# function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True, +# equations, dg::DGSEM, cache) +# surface_flux, nonconservative_flux = dg.surface_integral.surface_flux + +# neighbor_ids = cache.mortars.neighbor_ids +# large_sides = cache.mortars.large_sides +# orientations = cache.mortars.orientations + +# # The original CPU arrays hold NaNs +# u_upper_left = cache.mortars.u_upper_left +# u_upper_right = cache.mortars.u_upper_right +# u_lower_left = cache.mortars.u_lower_left +# u_lower_right = cache.mortars.u_lower_right +# reverse_upper = CuArray(dg.mortar.reverse_upper) +# reverse_lower = CuArray(dg.mortar.reverse_lower) + +# surface_flux_values = cache.elements.surface_flux_values +# tmp_surface_flux_values = zero(similar(surface_flux_values)) # undef to zero + +# fstar_primary_upper_left = cache.fstar_primary_upper_left +# fstar_primary_upper_right = cache.fstar_primary_upper_right +# fstar_primary_lower_left = cache.fstar_primary_lower_left +# fstar_primary_lower_right = cache.fstar_primary_lower_right +# fstar_secondary_upper_left = cache.fstar_secondary_upper_left +# fstar_secondary_upper_right = cache.fstar_secondary_upper_right +# fstar_secondary_lower_left = cache.fstar_secondary_lower_left +# fstar_secondary_lower_right = cache.fstar_secondary_lower_right + +# mortar_flux_kernel = @cuda launch=false mortar_flux_kernel!(fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# u_upper_left, u_upper_right, +# u_lower_left, u_lower_right, +# orientations, large_sides, +# equations, surface_flux, +# nonconservative_flux) +# mortar_flux_kernel(fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, +# large_sides, equations, surface_flux, nonconservative_flux; +# kernel_configurator_3d(mortar_flux_kernel, size(u_upper_left, 3), +# size(u_upper_left, 4), +# length(orientations))...) + +# tmp_upper_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_upper_right = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero +# tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# fstar_primary_upper_left, +# fstar_primary_upper_right, +# fstar_primary_lower_left, +# fstar_primary_lower_right, +# fstar_secondary_upper_left, +# fstar_secondary_upper_right, +# fstar_secondary_lower_left, +# fstar_secondary_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, +# tmp_surface_flux_values, +# tmp_upper_left, +# tmp_upper_right, +# tmp_lower_left, +# tmp_lower_right, +# reverse_upper, +# reverse_lower, +# neighbor_ids, +# large_sides, +# orientations, +# equations) +# mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, +# reverse_lower, neighbor_ids, large_sides, orientations, equations; +# kernel_configurator_3d(mortar_flux_copy_to_kernel, +# size(surface_flux_values, 1), +# size(surface_flux_values, 2)^2, +# length(orientations))...) + +# return nothing +# end + # Pack kernels for calculating mortar fluxes function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservative_terms::True, equations, dg::DGSEM, cache) @@ -2105,8 +2593,8 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati tmp_lower_left = zero(similar(surface_flux_values)) # undef to zero tmp_lower_right = zero(similar(surface_flux_values)) # undef to zero - # TODO: Combine these two copy kernels into one (synchronization) mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, + tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, @@ -2124,37 +2612,16 @@ function cuda_mortar_flux!(mesh::TreeMesh{3}, cache_mortars::True, nonconservati neighbor_ids, large_sides, orientations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_upper_left, tmp_upper_right, tmp_lower_left, - tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_secondary_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) - - mortar_flux_copy_to_kernel = @cuda launch=false mortar_flux_copy_to_kernel!(surface_flux_values, - tmp_surface_flux_values, - tmp_upper_left, - tmp_upper_right, - tmp_lower_left, - tmp_lower_right, - reverse_upper, - reverse_lower, - neighbor_ids, - large_sides, - orientations, - equations) - mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, - tmp_upper_right, tmp_lower_left, tmp_lower_right, reverse_upper, - reverse_lower, neighbor_ids, large_sides, orientations, equations; - kernel_configurator_3d(mortar_flux_copy_to_kernel, - size(surface_flux_values, 1), - size(surface_flux_values, 2)^2, - length(orientations))...) + mortar_flux_copy_to_kernel(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, tmp_upper_right, + tmp_lower_left, tmp_lower_right, fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, fstar_secondary_upper_left, + fstar_secondary_upper_right, fstar_secondary_lower_left, fstar_secondary_lower_right, + reverse_upper, reverse_lower, neighbor_ids, large_sides, orientations; + cooperative = true, + kernel_configurator_coop_3d(mortar_flux_copy_to_kernel, + size(surface_flux_values, 1), + size(surface_flux_values, 2)^2, + length(orientations))...) return nothing end