Commit
Switch to less parallelism to avoid redundant computation (#107)
* Start

* Complete 3D

* Complete 2D

* Complete 1D

* Complete
huiyuxie authored Jan 7, 2025
1 parent fa5e818 commit 2a5cb3e
Showing 3 changed files with 131 additions and 207 deletions.
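At a high level, the change replaces a thread layout in which one GPU thread was launched per (variable, node-pair) combination, so the same two-point flux was evaluated once per variable, with a layout that launches one thread per node index and loops over the variables inside the kernel. A minimal CPU sketch of the two layouts in Julia; the names `two_point_flux`, `D_split`, `nvars`, and `nnodes` are illustrative and do not appear in the diff:

```julia
# Sketch only: CPU analogue of the old vs. new parallelization, assuming a
# two-point flux `two_point_flux(u_l, u_r)` that returns an `nvars`-vector.
# These names are illustrative and not the package's API.

# Old layout: one "thread" per (v, j, jj) triple, so each flux pair (j, jj)
# is evaluated `nvars` times.
function volume_term_old!(du, u, D_split, two_point_flux, k)
    nvars, nnodes = size(du, 1), size(du, 2)
    for v in 1:nvars, j in 1:nnodes, jj in 1:nnodes
        f = two_point_flux(view(u, :, j, k), view(u, :, jj, k))  # recomputed for every v
        du[v, j, k] += D_split[j, jj] * f[v]
    end
    return nothing
end

# New layout: one "thread" per node index j; the variable index is an inner
# loop, so each flux pair (j, jj) is evaluated exactly once.
function volume_term_new!(du, u, D_split, two_point_flux, k)
    nvars, nnodes = size(du, 1), size(du, 2)
    for j in 1:nnodes, jj in 1:nnodes
        f = two_point_flux(view(u, :, j, k), view(u, :, jj, k))  # computed once per pair
        for v in 1:nvars
            du[v, j, k] += D_split[j, jj] * f[v]
        end
    end
    return nothing
end
```

The fused volume-integral kernels in the diffs below apply this pattern on the GPU, accumulating each node's result for all variables in shared memory (`shmem_value`).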
108 changes: 40 additions & 68 deletions src/solvers/dg_1d.jl
@@ -37,8 +37,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr)
return nothing
end

# Kernel for calculating fluxes and weak form
# An optimized version of the fusion of `flux_kernel!` and `weak_form_kernel!`
############################################################################## New optimization
# Kernel for calculating fluxes and weak form
function flux_weak_form_kernel!(du, u, derivative_dhat,
equations::AbstractEquations{1}, flux::Any)
# Set tile width
@@ -53,50 +53,35 @@ function flux_weak_form_kernel!(du, u, derivative_dhat,

# Get only the thread and block indices we need (to save registers)
tx, ty = threadIdx().x, threadIdx().y

# We construct more threads than we need to allocate shared memory
# Note that treating j as either ty1 or ty2 is valid, and here we
# treat it as ty1
ty1 = div(ty - 1, tile_width) + 1
ty2 = rem(ty - 1, tile_width) + 1

# We launch one block in x direction so i = tx
# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

# Tile the computation (restrict to one tile here)
value = zero(eltype(du))

# Load global `derivative_dhat` into shared memory
# Transposed memory access or not?
@inbounds begin
shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2]
for ty2 in axes(du, 2)
@inbounds shmem_dhat[ty2, ty] = derivative_dhat[ty, ty2] # transposed access
end

# Load global `flux_arr` into shared memory
# Note that `flux_arr` is removed for smaller GPU memory allocation
u_node = get_node_vars(u, equations, ty1, k)
# Compute flux values
u_node = get_node_vars(u, equations, ty, k)
flux_node = flux(u_node, 1, equations)

@inbounds begin
shmem_flux[tx, ty1] = flux_node[tx]
end
@inbounds shmem_flux[tx, ty] = flux_node[tx]

sync_threads()

# Loop within one block to get weak form
# TODO: Avoid potential bank conflicts and parallelize ty1 with threads to ty2,
# then consolidate each computation back to ty1
# How to replace shared memory `shmem_flux` with `flux_node`?
# TODO: Avoid potential bank conflicts
for thread in 1:tile_width
@inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread]
@inbounds value += shmem_dhat[thread, ty] * shmem_flux[tx, thread]
end

# Synchronization is not needed here if we use only one tile
# sync_threads()

# Finalize the weak form
@inbounds du[tx, ty1, k] = value
@inbounds du[tx, ty, k] = value

return nothing
end
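Tracing the indices in the new kernel, each thread assigns du[tx, ty, k] the sum over `thread` of derivative_dhat[ty, thread] times the flux at node `thread`; the transposed shared-memory copy of `derivative_dhat` changes only the access pattern, not the result. A minimal CPU reference of that computation (a sketch; `pointwise_flux` stands in for `flux(u_node, 1, equations)`, and the function name is illustrative):

```julia
# Sketch only: what the fused 1D weak-form kernel computes for one element `k`,
# written as a plain CPU loop. `pointwise_flux` and `weak_form_reference!` are
# illustrative names, not from this file.
function weak_form_reference!(du, u, derivative_dhat, pointwise_flux, k)
    nvars, nnodes = size(du, 1), size(du, 2)
    # Evaluate the flux at every node once; on the GPU this is the role of the
    # `shmem_flux` buffer filled cooperatively by the threads.
    f = [pointwise_flux(view(u, :, jj, k)) for jj in 1:nnodes]
    for j in 1:nnodes, v in 1:nvars
        du[v, j, k] = sum(derivative_dhat[j, jj] * f[jj][v] for jj in 1:nnodes)
    end
    return nothing
end

# Illustrative call with a linear "flux": weak_form_reference!(du, u, Dhat, un -> 2 .* un, 1)
```

The corresponding launch further down in this file switches the y thread count from size(du, 2)^2 to size(du, 2), since one thread per node is now enough.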
@@ -141,8 +126,8 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr,
return nothing
end

############################################################################## New optimization
# Kernel for calculating volume fluxes and volume integrals
# An optimized version of the fusion of `volume_flux_kernel!` and `volume_integral_kernel!`
function volume_flux_integral_kernel!(du, u, derivative_split,
equations::AbstractEquations{1}, volume_flux::Any)
# Set tile width
@@ -152,59 +137,46 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
# Allocate dynamic shared memory
shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width))
offset += sizeof(eltype(du)) * tile_width^2
shmem_vflux = @cuDynamicSharedMem(eltype(du),
(size(du, 1), tile_width, tile_width), offset)
shmem_value = @cuDynamicSharedMem(eltype(du), (size(du, 1), tile_width),
offset)

# Get only the thread and block indices we need (to save registers)
tx, ty = threadIdx().x, threadIdx().y

# We launch one block in y direction so j = ty
ty1 = div(ty - 1, tile_width) + 1 # same as j1
ty2 = rem(ty - 1, tile_width) + 1 # same as j2

# We launch one block in x direction so i = tx
# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
ty = threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

# j1 = div(j - 1, tile_width) + 1
# j2 = rem(j - 1, tile_width) + 1

# Tile the computation (restrict to one tile here)
value = zero(eltype(du))

# Load global `derivative_split` into shared memory
# Transposed memory access or not?
@inbounds begin
shmem_split[ty2, ty1] = derivative_split[ty1, ty2]
# Tile the computation (set to one tile here)
# Initialize the values
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty] = zero(eltype(du))
end

# Load global `volume_flux_arr` into shared memory
# Note that `volume_flux_arr` is removed for smaller GPU memory allocation
u_node = get_node_vars(u, equations, ty1, k)
u_node1 = get_node_vars(u, equations, ty2, k)

volume_flux_node = volume_flux(u_node, u_node1, 1, equations)

@inbounds begin
shmem_vflux[tx, ty1, ty2] = volume_flux_node[tx]
# Load global `derivative_split` into shared memory
for ty2 in axes(du, 2)
@inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] # transposed access
end

sync_threads()

# Loop within one block to get weak form
# TODO: Avoid potential bank conflicts and parallelize ty1 with threads to ty2,
# then consolidate each computation back to ty1
# How to replace shared memory `shmem_vflux` with `volume_flux_node`?
# Compute volume fluxes
# How to store nodes in shared memory?
for thread in 1:tile_width
@inbounds value += shmem_split[thread, ty1] * shmem_vflux[tx, ty1, thread]
# The volume flux is computationally heavy, so we do our best to avoid redundant
# computation, i.e., we use a for loop along the x direction here
volume_flux_node = volume_flux(get_node_vars(u, equations, ty, k),
get_node_vars(u, equations, thread, k),
1, equations)

# TODO: Avoid potential bank conflicts
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty] += shmem_split[thread, ty] * volume_flux_node[tx]
end
end

# Synchronization is not needed here if we use only one tile
# sync_threads()

# Finalize the weak form
@inbounds du[tx, ty1, k] = value
# Finalize the values
for tx in axes(du, 1)
@inbounds du[tx, ty, k] = shmem_value[tx, ty]
end

return nothing
end
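The reduced-parallelism pattern itself is easy to isolate: one thread per node index, a shared-memory accumulator covering all variables, and each two-point term evaluated exactly once per node pair. Below is a self-contained toy kernel in CUDA.jl that follows the same structure; the two-variable `toy_flux` and all other names are illustrative and are not part of the package:

```julia
using CUDA

# A toy kernel in the same spirit as the fused kernel above: one thread per node
# index, a shared-memory accumulator for all variables, and each two-point term
# evaluated once per node pair. Not the package's kernel.
@inline toy_flux(a1, a2, b1, b2) = (0.5f0 * (a1 + b1), 0.5f0 * (a2 + b2))

function toy_volume_kernel!(du, u, D)
    nnodes = size(du, 2)
    j = threadIdx().y                 # one thread per node index
    k = blockIdx().z                  # one block per element
    acc = CuDynamicSharedArray(eltype(du), (2, nnodes))

    @inbounds acc[1, j] = 0.0f0
    @inbounds acc[2, j] = 0.0f0

    for jj in 1:nnodes
        # The two-point term is evaluated once per (j, jj) pair and then reused
        # for both variables; this is the redundancy the commit avoids.
        @inbounds f = toy_flux(u[1, j, k], u[2, j, k], u[1, jj, k], u[2, jj, k])
        @inbounds acc[1, j] += D[j, jj] * f[1]
        @inbounds acc[2, j] += D[j, jj] * f[2]
    end

    @inbounds du[1, j, k] = acc[1, j]
    @inbounds du[2, j, k] = acc[2, j]
    return nothing
end

# Illustrative launch: 2 variables, 4 nodes, 8 elements.
du = CUDA.zeros(Float32, 2, 4, 8)
u = CUDA.rand(Float32, 2, 4, 8)
D = CUDA.rand(Float32, 4, 4)
shmem_bytes = 2 * 4 * sizeof(Float32)
@cuda threads=(1, 4, 1) blocks=(1, 1, 8) shmem=shmem_bytes toy_volume_kernel!(du, u, D)
```

No synchronization call is needed in this toy because each thread reads and writes only its own column of the accumulator.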
@@ -761,7 +733,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms,
equations, flux)
flux_weak_form_kernel(du, u, derivative_dhat, equations, flux;
shmem = shmem_size,
threads = (size(du, 1), size(du, 2)^2, 1),
threads = (size(du, 1), size(du, 2), 1),
blocks = (1, 1, size(du, 3)))
end

@@ -797,12 +769,12 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
volume_integral_kernel(du, derivative_split, volume_flux_arr, equations;
kernel_configurator_3d(volume_integral_kernel, size(du)...)...)
else
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)^2) * sizeof(eltype(du))
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)) * sizeof(eltype(du))
volume_flux_integral_kernel = @cuda launch=false volume_flux_integral_kernel!(du, u, derivative_split,
equations, volume_flux)
volume_flux_integral_kernel(du, u, derivative_split, equations, volume_flux;
shmem = shmem_size,
threads = (size(du, 1), size(du, 2)^2, 1),
threads = (1, size(du, 2), 1),
blocks = (1, 1, size(du, 3)))
end
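To make the resource change in this launch concrete, here is an illustrative calculation with hypothetical sizes (4 nodes, 3 variables, Float64; these numbers are not taken from the diff):

```julia
nnodes, nvars = 4, 3    # hypothetical: polynomial degree 3, three variables
T = Float64

# Old fused kernel: the full per-pair flux buffer lives in shared memory.
shmem_old   = (nnodes^2 + nvars * nnodes^2) * sizeof(T)   # (16 + 48) * 8 = 512 bytes
threads_old = nvars * nnodes^2                            # launch (nvars, nnodes^2, 1): 48 threads per block

# New fused kernel: only a per-node accumulator lives in shared memory.
shmem_new   = (nnodes^2 + nvars * nnodes) * sizeof(T)     # (16 + 12) * 8 = 224 bytes
threads_new = nnodes                                      # launch (1, nnodes, 1): 4 threads per block
```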

109 changes: 42 additions & 67 deletions src/solvers/dg_2d.jl
@@ -52,8 +52,8 @@ function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2)
return nothing
end

############################################################################## New optimization
# Kernel for calculating fluxes and weak form
# An optimized version of the fusion of `flux_kernel!` and `weak_form_kernel!`
function flux_weak_form_kernel!(du, u, derivative_dhat,
equations::AbstractEquations{2}, flux::Any)
# Set tile width
@@ -69,30 +69,18 @@ function flux_weak_form_kernel!(du, u, derivative_dhat,

# Get only the thread and block indices we need (to save registers)
tx, ty = threadIdx().x, threadIdx().y

# We launch one block in y direction so j = ty
ty1 = div(ty - 1, tile_width) + 1 # same as j1
ty2 = rem(ty - 1, tile_width) + 1 # same as j2

# We launch one block in x direction so i = tx
# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

# j1 = div(j - 1, tile_width) + 1
# j2 = rem(j - 1, tile_width) + 1
ty1 = div(ty - 1, tile_width) + 1
ty2 = rem(ty - 1, tile_width) + 1

# Tile the computation (restrict to one tile here)
value = zero(eltype(du))

# Load global `derivative_dhat` into shared memory
# Transposed memory access or not?
@inbounds begin
shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2]
end
@inbounds shmem_dhat[ty2, ty1] = derivative_dhat[ty1, ty2] # transposed access

# Load global `flux_arr1` and `flux_arr2` into shared memory, and note
# that they are removed now for smaller GPU memory allocation
# Compute flux values
u_node = get_node_vars(u, equations, ty1, ty2, k)
flux_node1 = flux(u_node, 1, equations)
flux_node2 = flux(u_node, 2, equations)
@@ -106,7 +94,6 @@ function flux_weak_form_kernel!(du, u, derivative_dhat,

# Loop within one block to get weak form
# TODO: Avoid potential bank conflicts
# How to replace shared memory `shmem_flux` with `flux_node`?
for thread in 1:tile_width
@inbounds begin
value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1]
@@ -178,8 +165,8 @@ function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_
return nothing
end

############################################################################## New optimization
# Kernel for calculating volume fluxes and volume integrals
# An optimized version of the fusion of `volume_flux_kernel!` and `volume_integral_kernel!`
function volume_flux_integral_kernel!(du, u, derivative_split,
equations::AbstractEquations{2}, volume_flux::Any)
# Set tile width
@@ -189,68 +176,56 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
# Allocate dynamic shared memory
shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width))
offset += sizeof(eltype(du)) * tile_width^2
shmem_vflux = @cuDynamicSharedMem(eltype(du),
(size(du, 1), tile_width, tile_width, tile_width, 2),
shmem_value = @cuDynamicSharedMem(eltype(du), (size(du, 1), tile_width, tile_width),
offset)

# Get only the thread and block indices we need (to save registers)
tx, ty = threadIdx().x, threadIdx().y

# We launch one block in y direction so j = ty
ty1 = div(ty - 1, tile_width^2) + 1 # same as j1
ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 # same as j2
ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 # same as j3

# We launch one block in x direction so i = tx
# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
ty = threadIdx().y
k = (blockIdx().z - 1) * blockDim().z + threadIdx().z

# j1 = div(j - 1, u2^2) + 1
# j2 = div(rem(j - 1, u2^2), u2) + 1
# j3 = rem(rem(j - 1, u2^2), u2) + 1
ty1 = div(ty - 1, tile_width) + 1
ty2 = rem(ty - 1, tile_width) + 1

# Tile the computation (restrict to one tile here)
value = zero(eltype(du))

# Load global `derivative_split` into shared memory
# Transposed memory access or not?
@inbounds begin
shmem_split[ty2, ty1] = derivative_split[ty1, ty2]
# Tile the computation (set to one tile here)
# Initialize the values
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty1, ty2] = zero(eltype(du))
end

# Load global `volume_flux_arr1` and `volume_flux_arr2` into shared memory,
# and note that they are removed now for smaller GPU memory allocation
u_node = get_node_vars(u, equations, ty1, ty2, k)
u_node1 = get_node_vars(u, equations, ty3, ty2, k)
u_node2 = get_node_vars(u, equations, ty1, ty3, k)

volume_flux_node1 = volume_flux(u_node, u_node1, 1, equations)
volume_flux_node2 = volume_flux(u_node, u_node2, 2, equations)

@inbounds begin
shmem_vflux[tx, ty1, ty3, ty2, 1] = volume_flux_node1[tx]
shmem_vflux[tx, ty1, ty2, ty3, 2] = volume_flux_node2[tx]
end

sync_threads()
# Load global `derivative_split` into shared memory
@inbounds shmem_split[ty2, ty1] = derivative_split[ty1, ty2] # transposed access

# Loop within one block to get weak form
# TODO: Avoid potential bank conflicts and parallelize (ty1, ty2) with threads to ty3,
# then consolidate each computation back to (ty1, ty2)
# How to replace shared memory `shmem_flux` with `flux_node`?
# Compute volume fluxes
# How to store nodes in shared memory?
for thread in 1:tile_width
@inbounds begin
value += shmem_split[thread, ty1] * shmem_vflux[tx, ty1, thread, ty2, 1]
value += shmem_split[thread, ty2] * shmem_vflux[tx, ty1, ty2, thread, 2]
# The volume flux is computationally heavy, so we do our best to avoid redundant
# computation, i.e., we use a for loop along the x direction here
u_node = get_node_vars(u, equations, ty1, ty2, k)
volume_flux_node1 = volume_flux(u_node,
get_node_vars(u, equations, thread, ty2, k),
1, equations)
volume_flux_node2 = volume_flux(u_node,
get_node_vars(u, equations, ty1, thread, k),
2, equations)

# TODO: Avoid potential bank conflicts
# Try another way to parallelize (ty1, ty2) with threads to ty3, then
# consolidate each computation back to (ty1, ty2)
for tx in axes(du, 1)
@inbounds begin
shmem_value[tx, ty1, ty2] += shmem_split[thread, ty1] * volume_flux_node1[tx]
shmem_value[tx, ty1, ty2] += shmem_split[thread, ty2] * volume_flux_node2[tx]
end
end
end

# Synchronization is not needed here if we use only one tile
# sync_threads()

# Finalize the weak form
@inbounds du[tx, ty1, ty2, k] = value
# Finalize the values
for tx in axes(du, 1)
@inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2]
end

return nothing
end
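Following the same indices in 2D, each thread handles one node (ty1, ty2) and, for every variable, sums derivative_split[ty1, thread] times the x-direction volume flux between nodes (ty1, ty2) and (thread, ty2), plus derivative_split[ty2, thread] times the y-direction volume flux between nodes (ty1, ty2) and (ty1, thread). A minimal CPU reference (a sketch; `two_point_flux` stands in for `volume_flux(..., orientation, equations)`, and the function name is illustrative):

```julia
# Sketch only: CPU reference for the fused 2D volume integral above.
# `two_point_flux(u_l, u_r, orientation)` and `volume_integral_2d_reference!`
# are illustrative names, not from this file.
function volume_integral_2d_reference!(du, u, derivative_split, two_point_flux, k)
    nvars, nnodes = size(du, 1), size(du, 2)
    for j1 in 1:nnodes, j2 in 1:nnodes      # one GPU thread per (j1, j2) in the new kernel
        acc = zeros(eltype(du), nvars)      # the role of `shmem_value[:, ty1, ty2]`
        u_node = view(u, :, j1, j2, k)
        for jj in 1:nnodes
            f1 = two_point_flux(u_node, view(u, :, jj, j2, k), 1)  # x-direction pair
            f2 = two_point_flux(u_node, view(u, :, j1, jj, k), 2)  # y-direction pair
            for v in 1:nvars
                acc[v] += derivative_split[j1, jj] * f1[v] +
                          derivative_split[j2, jj] * f2[v]
            end
        end
        du[:, j1, j2, k] .= acc
    end
    return nothing
end
```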
@@ -1262,12 +1237,12 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
kernel_configurator_3d(volume_integral_kernel, size(du, 1),
size(du, 2)^2, size(du, 4))...)
else
shmem_size = (size(du, 2)^2 + size(du, 1) * 2 * size(du, 2)^3) * sizeof(eltype(du))
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)^2) * sizeof(eltype(du))
volume_flux_integral_kernel = @cuda launch=false volume_flux_integral_kernel!(du, u, derivative_split,
equations, volume_flux)
volume_flux_integral_kernel(du, u, derivative_split, equations, volume_flux;
shmem = shmem_size,
threads = (size(du, 1), size(du, 2)^3, 1),
threads = (1, size(du, 2)^2, 1),
blocks = (1, 1, size(du, 4)))
end
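The same side-by-side resource comparison for this 2D launch, again with hypothetical sizes (4 nodes per direction, 4 variables, Float64; not values from the diff):

```julia
nnodes, nvars = 4, 4    # hypothetical: 4 nodes per direction, four variables
T = Float64

# Old fused kernel: both direction-wise flux buffers live in shared memory.
shmem_old   = (nnodes^2 + nvars * 2 * nnodes^3) * sizeof(T)  # (16 + 512) * 8 = 4224 bytes
threads_old = nvars * nnodes^3                               # launch (nvars, nnodes^3, 1): 256 threads per block

# New fused kernel: only the per-node accumulator lives in shared memory.
shmem_new   = (nnodes^2 + nvars * nnodes^2) * sizeof(T)      # (16 + 64) * 8 = 640 bytes
threads_new = nnodes^2                                       # launch (1, nnodes^2, 1): 16 threads per block
```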

(The diff for the third changed file is not shown here.)
