Commit
Switch to less parallelism to avoid redundant computation (#107)
* Start

* Complete 3D

* Complete 2D

* Complete 1D

* Complete
huiyuxie authored Jan 7, 2025
1 parent fa5e818 commit 2a5cb3e
Showing 3 changed files with 131 additions and 207 deletions.
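At a high level, the change replaces a thread layout in which one GPU thread was launched per (variable, node-pair) combination, so the same two-point flux was evaluated once per variable, with a layout that launches one thread per node index and loops over the variables inside the kernel. A minimal CPU sketch of the two layouts in Julia; the names `two_point_flux`, `D_split`, `nvars`, and `nnodes` are illustrative and do not appear in the diff:

```julia
# Sketch only: CPU analogue of the old vs. new parallelization, assuming a
# two-point flux `two_point_flux(u_l, u_r)` that returns an `nvars`-vector.
# These names are illustrative and not the package's API.

# Old layout: one "thread" per (v, j, jj) triple, so each flux pair (j, jj)
# is evaluated `nvars` times.
function volume_term_old!(du, u, D_split, two_point_flux, k)
    nvars, nnodes = size(du, 1), size(du, 2)
    for v in 1:nvars, j in 1:nnodes, jj in 1:nnodes
        f = two_point_flux(view(u, :, j, k), view(u, :, jj, k))  # recomputed for every v
        du[v, j, k] += D_split[j, jj] * f[v]
    end
    return nothing
end

# New layout: one "thread" per node index j; the variable index is an inner
# loop, so each flux pair (j, jj) is evaluated exactly once.
function volume_term_new!(du, u, D_split, two_point_flux, k)
    nvars, nnodes = size(du, 1), size(du, 2)
    for j in 1:nnodes, jj in 1:nnodes
        f = two_point_flux(view(u, :, j, k), view(u, :, jj, k))  # computed once per pair
        for v in 1:nvars
            du[v, j, k] += D_split[j, jj] * f[v]
        end
    end
    return nothing
end
```

The fused volume-integral kernels in the diffs below apply this pattern on the GPU, accumulating each node's result for all variables in shared memory (`shmem_value`).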
108 changes: 40 additions & 68 deletions src/solvers/dg_1d.jl
@@ -37,8 +37,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr)
return nothing
end

# Kernel for calculating fluxes and weak form
# An optimized version of the fusion of `flux_kernel!` and `weak_form_kernel!`
############################################################################## New optimization
# Kernel for calculating fluxes and weak form
function flux_weak_form_kernel!(du, u, derivative_dhat,
equations::AbstractEquations{1}, flux::Any)
# Set tile width
@@ -53,50 +53,35 @@ function flux_weak_form_kernel!(du, u, derivative_dhat,

# Get only the thread and block indices we need (to save registers)
tx, ty = threadIdx().x, threadIdx().y

# We construct more threads than we need to allocate shared memory
# Note that treating j as either ty1 or ty2 is valid, and here we
# treat it as ty1
ty1 = div(ty - 1, tile_width) + 1
ty2 = rem(ty - 1, tile_width) + 1

# We launch one block in x direction so i = tx
# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

# Tile the computation (restrict to one tile here)
value = zero(eltype(du))

# Load global `derivative_dhat` into shared memory
# Transposed memory access or not?
@inbounds begin
shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2]
for ty2 in axes(du, 2)
@inbounds shmem_dhat[ty2, ty] = derivative_dhat[ty, ty2] # transposed access
end

# Load global `flux_arr` into shared memory
# Note that `flux_arr` is removed for smaller GPU memory allocation
u_node = get_node_vars(u, equations, ty1, k)
# Compute flux values
u_node = get_node_vars(u, equations, ty, k)
flux_node = flux(u_node, 1, equations)

@inbounds begin
shmem_flux[tx, ty1] = flux_node[tx]
end
@inbounds shmem_flux[tx, ty] = flux_node[tx]

sync_threads()

# Loop within one block to get weak form
# TODO: Avoid potential bank conflicts and parallelize ty1 with threads to ty2,
# then consolidate each computation back to ty1
# How to replace shared memory `shmem_flux` with `flux_node`?
# TODO: Avoid potential bank conflicts
for thread in 1:tile_width
@inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread]
@inbounds value += shmem_dhat[thread, ty] * shmem_flux[tx, thread]
end

# Synchronization is not needed here if we use only one tile
# sync_threads()

# Finalize the weak form
@inbounds du[tx, ty1, k] = value
@inbounds du[tx, ty, k] = value

return nothing
end
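Tracing the indices in the new kernel, each thread assigns du[tx, ty, k] the sum over `thread` of derivative_dhat[ty, thread] times the flux at node `thread`; the transposed shared-memory copy of `derivative_dhat` changes only the access pattern, not the result. A minimal CPU reference of that computation (a sketch; `pointwise_flux` stands in for `flux(u_node, 1, equations)`, and the function name is illustrative):

```julia
# Sketch only: what the fused 1D weak-form kernel computes for one element `k`,
# written as a plain CPU loop. `pointwise_flux` and `weak_form_reference!` are
# illustrative names, not from this file.
function weak_form_reference!(du, u, derivative_dhat, pointwise_flux, k)
    nvars, nnodes = size(du, 1), size(du, 2)
    # Evaluate the flux at every node once; on the GPU this is the role of the
    # `shmem_flux` buffer filled cooperatively by the threads.
    f = [pointwise_flux(view(u, :, jj, k)) for jj in 1:nnodes]
    for j in 1:nnodes, v in 1:nvars
        du[v, j, k] = sum(derivative_dhat[j, jj] * f[jj][v] for jj in 1:nnodes)
    end
    return nothing
end

# Illustrative call with a linear "flux": weak_form_reference!(du, u, Dhat, un -> 2 .* un, 1)
```

The corresponding launch further down in this file switches the y thread count from size(du, 2)^2 to size(du, 2), since one thread per node is now enough.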
@@ -141,8 +126,8 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr,
return nothing
end

############################################################################## New optimization
# Kernel for calculating volume fluxes and volume integrals
# An optimized version of the fusion of `volume_flux_kernel!` and `volume_integral_kernel!`
function volume_flux_integral_kernel!(du, u, derivative_split,
equations::AbstractEquations{1}, volume_flux::Any)
# Set tile width
@@ -152,59 +137,46 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
# Allocate dynamic shared memory
shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width))
offset += sizeof(eltype(du)) * tile_width^2
shmem_vflux = @cuDynamicSharedMem(eltype(du),
(size(du, 1), tile_width, tile_width), offset)
shmem_value = @cuDynamicSharedMem(eltype(du), (size(du, 1), tile_width),
offset)

# Get only the thread and block indices we need (to save registers)
tx, ty = threadIdx().x, threadIdx().y

# We launch one block in y direction so j = ty
ty1 = div(ty - 1, tile_width) + 1 # same as j1
ty2 = rem(ty - 1, tile_width) + 1 # same as j2

# We launch one block in x direction so i = tx
# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
ty = threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

# j1 = div(j - 1, tile_width) + 1
# j2 = rem(j - 1, tile_width) + 1

# Tile the computation (restrict to one tile here)
value = zero(eltype(du))

# Load global `derivative_split` into shared memory
# Transposed memory access or not?
@inbounds begin
shmem_split[ty2, ty1] = derivative_split[ty1, ty2]
# Tile the computation (set to one tile here)
# Initialize the values
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty] = zero(eltype(du))
end

# Load global `volume_flux_arr` into shared memory
# Note that `volume_flux_arr` is removed for smaller GPU memory allocation
u_node = get_node_vars(u, equations, ty1, k)
u_node1 = get_node_vars(u, equations, ty2, k)

volume_flux_node = volume_flux(u_node, u_node1, 1, equations)

@inbounds begin
shmem_vflux[tx, ty1, ty2] = volume_flux_node[tx]
# Load global `derivative_split` into shared memory
for ty2 in axes(du, 2)
@inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] # transposed access
end

sync_threads()

# Loop within one block to get weak form
# TODO: Avoid potential bank conflicts and parallelize ty1 with threads to ty2,
# then consolidate each computation back to ty1
# How to replace shared memory `shmem_vflux` with `volume_flux_node`?
# Compute volume fluxes
# How to store nodes in shared memory?
for thread in 1:tile_width
@inbounds value += shmem_split[thread, ty1] * shmem_vflux[tx, ty1, thread]
# The volume flux is computationally heavy, so we do our best to avoid redundant
# computation, i.e., we use a for loop along the x direction here
volume_flux_node = volume_flux(get_node_vars(u, equations, ty, k),
get_node_vars(u, equations, thread, k),
1, equations)

# TODO: Avoid potential bank conflicts
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty] += shmem_split[thread, ty] * volume_flux_node[tx]
end
end

# Synchronization is not needed here if we use only one tile
# sync_threads()

# Finalize the weak form
@inbounds du[tx, ty1, k] = value
# Finalize the values
for tx in axes(du, 1)
@inbounds du[tx, ty, k] = shmem_value[tx, ty]
end

return nothing
end
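The reduced-parallelism pattern itself is easy to isolate: one thread per node index, a shared-memory accumulator covering all variables, and each two-point term evaluated exactly once per node pair. Below is a self-contained toy kernel in CUDA.jl that follows the same structure; the two-variable `toy_flux` and all other names are illustrative and are not part of the package:

```julia
using CUDA

# A toy kernel in the same spirit as the fused kernel above: one thread per node
# index, a shared-memory accumulator for all variables, and each two-point term
# evaluated once per node pair. Not the package's kernel.
@inline toy_flux(a1, a2, b1, b2) = (0.5f0 * (a1 + b1), 0.5f0 * (a2 + b2))

function toy_volume_kernel!(du, u, D)
    nnodes = size(du, 2)
    j = threadIdx().y                 # one thread per node index
    k = blockIdx().z                  # one block per element
    acc = CuDynamicSharedArray(eltype(du), (2, nnodes))

    @inbounds acc[1, j] = 0.0f0
    @inbounds acc[2, j] = 0.0f0

    for jj in 1:nnodes
        # The two-point term is evaluated once per (j, jj) pair and then reused
        # for both variables; this is the redundancy the commit avoids.
        @inbounds f = toy_flux(u[1, j, k], u[2, j, k], u[1, jj, k], u[2, jj, k])
        @inbounds acc[1, j] += D[j, jj] * f[1]
        @inbounds acc[2, j] += D[j, jj] * f[2]
    end

    @inbounds du[1, j, k] = acc[1, j]
    @inbounds du[2, j, k] = acc[2, j]
    return nothing
end

# Illustrative launch: 2 variables, 4 nodes, 8 elements.
du = CUDA.zeros(Float32, 2, 4, 8)
u = CUDA.rand(Float32, 2, 4, 8)
D = CUDA.rand(Float32, 4, 4)
shmem_bytes = 2 * 4 * sizeof(Float32)
@cuda threads=(1, 4, 1) blocks=(1, 1, 8) shmem=shmem_bytes toy_volume_kernel!(du, u, D)
```

No synchronization call is needed in this toy because each thread reads and writes only its own column of the accumulator.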
@@ -761,7 +733,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms,
equations, flux)
flux_weak_form_kernel(du, u, derivative_dhat, equations, flux;
shmem = shmem_size,
threads = (size(du, 1), size(du, 2)^2, 1),
threads = (size(du, 1), size(du, 2), 1),
blocks = (1, 1, size(du, 3)))
end

@@ -797,12 +769,12 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
volume_integral_kernel(du, derivative_split, volume_flux_arr, equations;
kernel_configurator_3d(volume_integral_kernel, size(du)...)...)
else
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)^2) * sizeof(eltype(du))
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)) * sizeof(eltype(du))
volume_flux_integral_kernel = @cuda launch=false volume_flux_integral_kernel!(du, u, derivative_split,
equations, volume_flux)
volume_flux_integral_kernel(du, u, derivative_split, equations, volume_flux;
shmem = shmem_size,
threads = (size(du, 1), size(du, 2)^2, 1),
threads = (1, size(du, 2), 1),
blocks = (1, 1, size(du, 3)))
end
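To make the resource change in this launch concrete, here is an illustrative calculation with hypothetical sizes (4 nodes, 3 variables, Float64; these numbers are not taken from the diff):

```julia
nnodes, nvars = 4, 3    # hypothetical: polynomial degree 3, three variables
T = Float64

# Old fused kernel: the full per-pair flux buffer lives in shared memory.
shmem_old   = (nnodes^2 + nvars * nnodes^2) * sizeof(T)   # (16 + 48) * 8 = 512 bytes
threads_old = nvars * nnodes^2                            # launch (nvars, nnodes^2, 1): 48 threads per block

# New fused kernel: only a per-node accumulator lives in shared memory.
shmem_new   = (nnodes^2 + nvars * nnodes) * sizeof(T)     # (16 + 12) * 8 = 224 bytes
threads_new = nnodes                                      # launch (1, nnodes, 1): 4 threads per block
```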

109 changes: 42 additions & 67 deletions src/solvers/dg_2d.jl
@@ -52,8 +52,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2)
return nothing
end

############################################################################## New optimization
# Kernel for calculating fluxes and weak form
# An optimized version of the fusion of `flux_kernel!` and `weak_form_kernel!`
function flux_weak_form_kernel!(du, u, derivative_dhat,
equations::AbstractEquations{2}, flux::Any)
# Set tile width
@@ -69,30 +69,18 @@ function flux_weak_form_kernel!(du, u, derivative_dhat,

# Get only the thread and block indices we need (to save registers)
tx, ty = threadIdx().x, threadIdx().y

# We launch one block in y direction so j = ty
ty1 = div(ty - 1, tile_width) + 1 # same as j1
ty2 = rem(ty - 1, tile_width) + 1 # same as j2

# We launch one block in x direction so i = tx
# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

# j1 = div(j - 1, tile_width) + 1
# j2 = rem(j - 1, tile_width) + 1
ty1 = div(ty - 1, tile_width) + 1
ty2 = rem(ty - 1, tile_width) + 1

# Tile the computation (restrict to one tile here)
value = zero(eltype(du))

# Load global `derivative_dhat` into shared memory
# Transposed memory access or not?
@inbounds begin
shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2]
end
@inbounds shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2] # transposed access

# Load global `flux_arr1` and `flux_arr2` into shared memory, and note
# that they are removed now for smaller GPU memory allocation
# Compute flux values
u_node = get_node_vars(u, equations, ty1, ty2, k)
flux_node1 = flux(u_node, 1, equations)
flux_node2 = flux(u_node, 2, equations)
@@ -106,7 +94,6 @@ function flux_weak_form_kernel!(du, u, derivative_dhat,

# Loop within one block to get weak form
# TODO: Avoid potential bank conflicts
# How to replace shared memory `shmem_flux` with `flux_node`?
for thread in 1:tile_width
@inbounds begin
value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1]
@@ -178,8 +165,8 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_
return nothing
end

############################################################################## New optimization
# Kernel for calculating volume fluxes and volume integrals
# An optimized version of the fusion of `volume_flux_kernel!` and `volume_integral_kernel!`
function volume_flux_integral_kernel!(du, u, derivative_split,
equations::AbstractEquations{2}, volume_flux::Any)
# Set tile width
@@ -189,68 +176,56 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
# Allocate dynamic shared memory
shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width))
offset += sizeof(eltype(du)) * tile_width^2
shmem_vflux = @cuDynamicSharedMem(eltype(du),
(size(du, 1), tile_width, tile_width, tile_width, 2),
shmem_value = @cuDynamicSharedMem(eltype(du), (size(du, 1), tile_width, tile_width),
offset)

# Get only the thread and block indices we need (to save registers)
tx, ty = threadIdx().x, threadIdx().y

# We launch one block in y direction so j = ty
ty1 = div(ty - 1, tile_width^2) + 1 # same as j1
ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 # same as j2
ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 # same as j3

# We launch one block in x direction so i = tx
# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
ty = threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

# j1 = div(j - 1, u2^2) + 1
# j2 = div(rem(j - 1, u2^2), u2) + 1
# j3 = rem(rem(j - 1, u2^2), u2) + 1
ty1 = div(ty - 1, tile_width) + 1
ty2 = rem(ty - 1, tile_width) + 1

# Tile the computation (restrict to one tile here)
value = zero(eltype(du))

# Load global `derivative_split` into shared memory
# Transposed memory access or not?
@inbounds begin
shmem_split[ty2, ty1] = derivative_split[ty1, ty2]
# Tile the computation (set to one tile here)
# Initialize the values
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty1, ty2] = zero(eltype(du))
end

# Load global `volume_flux_arr1` and `volume_flux_arr2` into shared memory,
# and note that they are removed now for smaller GPU memory allocation
u_node = get_node_vars(u, equations, ty1, ty2, k)
u_node1 = get_node_vars(u, equations, ty3, ty2, k)
u_node2 = get_node_vars(u, equations, ty1, ty3, k)

volume_flux_node1 = volume_flux(u_node, u_node1, 1, equations)
volume_flux_node2 = volume_flux(u_node, u_node2, 2, equations)

@inbounds begin
shmem_vflux[tx, ty1, ty3, ty2, 1] = volume_flux_node1[tx]
shmem_vflux[tx, ty1, ty2, ty3, 2] = volume_flux_node2[tx]
end

sync_threads()
# Load global `derivative_split` into shared memory
@inbounds shmem_split[ty2, ty1] = derivative_split[ty1, ty2] # transposed access

# Loop within one block to get weak form
# TODO: Avoid potential bank conflicts and parallelize (ty1, ty2) with threads to ty3,
# then consolidate each computation back to (ty1, ty2)
# How to replace shared memory `shmem_flux` with `flux_node`?
# Compute volume fluxes
# How to store nodes in shared memory?
for thread in 1:tile_width
@inbounds begin
value += shmem_split[thread, ty1] * shmem_vflux[tx, ty1, thread, ty2, 1]
value += shmem_split[thread, ty2] * shmem_vflux[tx, ty1, ty2, thread, 2]
# The volume flux is computationally heavy, so we do our best to avoid redundant
# computation, i.e., we use a for loop along the x direction here
u_node = get_node_vars(u, equations, ty1, ty2, k)
volume_flux_node1 = volume_flux(u_node,
get_node_vars(u, equations, thread, ty2, k),
1, equations)
volume_flux_node2 = volume_flux(u_node,
get_node_vars(u, equations, ty1, thread, k),
2, equations)

# TODO: Avoid potential bank conflicts
# Try another way to parallelize (ty1, ty2) with threads to ty3, then
# consolidate each computation back to (ty1, ty2)
for tx in axes(du, 1)
@inbounds begin
shmem_value[tx, ty1, ty2] += shmem_split[thread, ty1] * volume_flux_node1[tx]
shmem_value[tx, ty1, ty2] += shmem_split[thread, ty2] * volume_flux_node2[tx]
end
end
end

# Synchronization is not needed here if we use only one tile
# sync_threads()

# Finalize the weak form
@inbounds du[tx, ty1, ty2, k] = value
# Finalize the values
for tx in axes(du, 1)
@inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2]
end

return nothing
end
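Following the same indices in 2D, each thread handles one node (ty1, ty2) and, for every variable, sums derivative_split[ty1, thread] times the x-direction volume flux between nodes (ty1, ty2) and (thread, ty2), plus derivative_split[ty2, thread] times the y-direction volume flux between nodes (ty1, ty2) and (ty1, thread). A minimal CPU reference (a sketch; `two_point_flux` stands in for `volume_flux(..., orientation, equations)`, and the function name is illustrative):

```julia
# Sketch only: CPU reference for the fused 2D volume integral above.
# `two_point_flux(u_l, u_r, orientation)` and `volume_integral_2d_reference!`
# are illustrative names, not from this file.
function volume_integral_2d_reference!(du, u, derivative_split, two_point_flux, k)
    nvars, nnodes = size(du, 1), size(du, 2)
    for j1 in 1:nnodes, j2 in 1:nnodes      # one GPU thread per (j1, j2) in the new kernel
        acc = zeros(eltype(du), nvars)      # the role of `shmem_value[:, ty1, ty2]`
        u_node = view(u, :, j1, j2, k)
        for jj in 1:nnodes
            f1 = two_point_flux(u_node, view(u, :, jj, j2, k), 1)  # x-direction pair
            f2 = two_point_flux(u_node, view(u, :, j1, jj, k), 2)  # y-direction pair
            for v in 1:nvars
                acc[v] += derivative_split[j1, jj] * f1[v] +
                          derivative_split[j2, jj] * f2[v]
            end
        end
        du[:, j1, j2, k] .= acc
    end
    return nothing
end
```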
@@ -1262,12 +1237,12 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
kernel_configurator_3d(volume_integral_kernel, size(du, 1),
size(du, 2)^2, size(du, 4))...)
else
shmem_size = (size(du, 2)^2 + size(du, 1) * 2 * size(du, 2)^3) * sizeof(eltype(du))
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)^2) * sizeof(eltype(du))
volume_flux_integral_kernel = @cuda launch=false volume_flux_integral_kernel!(du, u, derivative_split,
equations, volume_flux)
volume_flux_integral_kernel(du, u, derivative_split, equations, volume_flux;
shmem = shmem_size,
threads = (size(du, 1), size(du, 2)^3, 1),
threads = (1, size(du, 2)^2, 1),
blocks = (1, 1, size(du, 4)))
end
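The same side-by-side resource comparison for this 2D launch, again with hypothetical sizes (4 nodes per direction, 4 variables, Float64; not values from the diff):

```julia
nnodes, nvars = 4, 4    # hypothetical: 4 nodes per direction, four variables
T = Float64

# Old fused kernel: both direction-wise flux buffers live in shared memory.
shmem_old   = (nnodes^2 + nvars * 2 * nnodes^3) * sizeof(T)  # (16 + 512) * 8 = 4224 bytes
threads_old = nvars * nnodes^3                               # launch (nvars, nnodes^3, 1): 256 threads per block

# New fused kernel: only the per-node accumulator lives in shared memory.
shmem_new   = (nnodes^2 + nvars * nnodes^2) * sizeof(T)      # (16 + 64) * 8 = 640 bytes
threads_new = nnodes^2                                       # launch (1, nnodes^2, 1): 16 threads per block
```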

(The diff for the third changed file is not shown here.)
