From dfc1b26ef53935715c46f66885b9c77fbfea8755 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Mon, 12 Jun 2023 11:28:26 +1000 Subject: [PATCH 1/4] Add acknowledgements to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c55f3fd..e6662a2 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ Repo for the NCI-NVIDIA hackaton on June 2023. +This work was completed in part at the NCI-NVIDIA hackathon, part of the Open Hackathons program. The authors would like to acknowledge OpenACC-Standard.org for their support. + ## Objective The objective is to implement sum-factorization techniques for GPU architectures, within From a73fd6edaf2e41cc5dd5cceb6e4d88e61020e657 Mon Sep 17 00:00:00 2001 From: Jordi Manyer Date: Fri, 16 Jun 2023 17:26:58 +1000 Subject: [PATCH 2/4] Save progress --- src/GPUKernels.jl | 477 ++++++++++++++++++++++++++-- src/NCIHackaton2023.jl | 3 + src/SumFactorizationMaps.jl | 3 +- test/GPU_v0.jl | 11 +- test/GPU_v1.jl | 19 +- test/GPU_v2.jl | 25 +- test/GPU_v3.jl | 27 +- test/GPU_v5.jl | 111 +++++++ test/GPU_v7.jl | 111 +++++++ test/{cdot.jl => benchmark_cdot.jl} | 0 test/my_test.jl | 9 + 11 files changed, 743 insertions(+), 53 deletions(-) create mode 100644 test/GPU_v5.jl create mode 100644 test/GPU_v7.jl rename test/{cdot.jl => benchmark_cdot.jl} (100%) diff --git a/src/GPUKernels.jl b/src/GPUKernels.jl index 53cb79f..8b0b45c 100644 --- a/src/GPUKernels.jl +++ b/src/GPUKernels.jl @@ -239,10 +239,10 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, tidx_step = blockDim().x s1 = blockDim().y*D*SB[1]*SB[2]; s2 = blockDim().y*D*SQ[1]*SB[2]; s3 = blockDim().y*D*SQ[1]*SQ[2]; - Z = @cuDynamicSharedMem(Float64,s1+s2+s3) + Z = @cuDynamicSharedMem(Float64,max(s1,s3)+s2) Z1 = view(Z,1:s1) - Z2 = view(Z,s1+1:s1+s2) - Z3 = view(Z,s1+s2+1:s1+s2+s3) + Z2 = view(Z,max(s1,s3)+1:s1+s2) + Z3 = view(Z,1:s3) cell = (blockIdx().x - 1) * blockDim().y + threadIdx().y while cell <= nCells @@ -259,7 +259,7 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, id = ids[i] z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r - Z1[z1_idx] = x[max(id, 1)] * (id > 0) + Z1[z1_idx] = x[abs(id)] * (id > 0) loop_idx += tidx_step end CUDA.sync_threads() @@ -272,7 +272,7 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z2[z2_idx] = 0.0 - for j1 in 1:SB[1] + @inbounds for j1 in 1:SB[1] z1_idx = (tidy - 1) * SB[2] * SB[1] * D + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r Z2[z2_idx] += mats[i1, j1, 1, r] * Z1[z1_idx] end @@ -287,11 +287,10 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, z3_idx = (tidy - 1) * s + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z3[z3_idx] = 0.0 - for j2 in 1:SB[2] + @inbounds for j2 in 1:SB[2] z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z3[z3_idx] += mats[i2, j2, 2, r] * Z2[z2_idx] end - loop_idx += tidx_step end CUDA.sync_threads() @@ -315,7 +314,7 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, r, i1, j2 = @index_to_tuple(loop_idx, D, SQ[1], SB[2]) z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z2[z2_idx] = 0.0 - for i2 in 1:SQ[2] + @inbounds for i2 in 1:SQ[2] z3_idx = (tidy - 1) * SQ[2] * SQ[1] * D + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z2[z2_idx] += mats[i2, j2, 2, r] * Z3[z3_idx] end @@ -329,7 +328,7 @@ function 
gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, r, j1, j2 = @index_to_tuple(loop_idx, D, SB[1], SB[2]) z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r Z1[z1_idx] = 0.0 - for i1 in 1:SQ[1] + @inbounds for i1 in 1:SQ[1] z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z1[z1_idx] += mats[i1, j1, 1, r] * Z2[z2_idx] end @@ -347,9 +346,7 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, j2 = I[2] id = ids[i] z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r - if id > 0 - CUDA.@atomic y[id] += Z1[z1_idx] - end + CUDA.@atomic y[abs(id)] += Z1[z1_idx] * (id > 0) loop_idx += tidx_step end CUDA.sync_threads() @@ -365,10 +362,11 @@ end """ SUMFAC-GPU Kernel v3 Fourth version. Reordering of the matrices to obtain better memory access patterns. + Matrices are still in ConstantMemory. - We process blockDim().y cells at the same time. - We use blockDim().x threads to process each cell. """ -function gpu_mul_v3!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq, ij_mats, ji_mats) where {D, SB, SQ} +function gpu_mul_v3!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq, ij_mats, ji_mats,dof_map) where {D, SB, SQ} dof_map = m.dof_map CUDA.Const(ij_mats) CUDA.Const(ji_mats) @@ -392,13 +390,10 @@ function gpu_mul_v3!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, s = D * SB[1] * SB[2] while loop_idx <= s r, i = @index_to_tuple(loop_idx, D, SB[1] * SB[2]) - I = dof_map[i] - j1 = I[1] - j2 = I[2] + I = dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) id = ids[i] z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r - - Z1[z1_idx] = x[max(id, 1)] * (id > 0) + Z1[z1_idx] = x[abs(id)] * (id > 0) loop_idx += tidx_step end CUDA.sync_threads() @@ -481,14 +476,10 @@ function gpu_mul_v3!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, s = D * SB[1] * SB[2] while loop_idx <= s r, i = @index_to_tuple(loop_idx, D, SB[1] * SB[2]) - I = dof_map[i] - j1 = I[1] - j2 = I[2] + I = dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) id = ids[i] z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r - if id > 0 - CUDA.@atomic y[id] += Z1[z1_idx] - end + CUDA.@atomic y[abs(id)] += Z1[z1_idx] * (id > 0) loop_idx += tidx_step end CUDA.sync_threads() @@ -636,4 +627,442 @@ function gpu_mul_v4!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, end return +end + + +""" + SUMFAC-GPU Kernel v5 + Try to reorder some of the indexes to obtain better accesses to SharedMemory. + Matrices still in ConstantMemory. + - We process blockDim().y cells at the same time. + - We use blockDim().x threads to process each cell. 
+""" +function gpu_mul_v5!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq, ij_mats, ji_mats, dof_map) where {D, SB, SQ} + CUDA.Const(ij_mats) + CUDA.Const(ji_mats) + CUDA.Const(dof_map) + tidy = threadIdx().y + tidx = threadIdx().x + tidx_step = blockDim().x + + s1 = blockDim().y*D*SB[1]*SB[2]; s2 = blockDim().y*D*SQ[1]*SB[2]; s3 = blockDim().y*D*SQ[1]*SQ[2]; + Z = @cuDynamicSharedMem(Float64,max(s1,s3)+s2) + Z1 = view(Z,1:s1) + Z2 = view(Z,max(s1,s3)+1:s1+s2) + Z3 = view(Z,1:s3) + + cell = (blockIdx().x - 1) * blockDim().y + threadIdx().y + while cell <= nCells + # Scatter + ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + + loop_idx = tidx + s = SB[1] * SB[2] + while loop_idx <= s + i = dof_map[loop_idx] + id = ids[i] + val = x[abs(id)] * (id > 0) + @inbounds for r in 1:D + z1_idx = (tidy - 1) * s * D + (loop_idx-1)*D + r + Z1[z1_idx] = val + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Forward pass + loop_idx = tidx + s = D * SQ[1] * SB[2] + while loop_idx <= s + r, i1, j2 = @index_to_tuple(loop_idx, D, SQ[1], SB[2]) + val = 0.0 + @inbounds for j1 in 1:SB[1] + z1_idx = (tidy - 1) * SB[2] * SB[1] * D + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r + val += ji_mats[j1,r,i1,1] * Z1[z1_idx] + end + z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r + Z2[z2_idx] = val + loop_idx += tidx_step + end + CUDA.sync_threads() + + loop_idx = tidx + s = D * SQ[1] * SQ[2] + while loop_idx <= s + r, i1, i2 = @index_to_tuple(loop_idx, D, SQ[1], SQ[2]) + val = 0.0 + @inbounds for j2 in 1:SB[2] + z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r + val += ji_mats[j2,r,i2,2] * Z2[z2_idx] + end + z3_idx = (tidy - 1) * s + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r + Z3[z3_idx] = val + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Apply weights + loop_idx = tidx + s = D * SQ[1] * SQ[2] + while loop_idx <= s + r, i1, i2 = @index_to_tuple(loop_idx, D, SQ[1], SQ[2]) + idx = (i2 - 1) * SQ[1] + i1 + z3_idx = (tidy - 1) * s + (idx - 1) * D + r + Z3[z3_idx] *= wq[idx] + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Backward pass + loop_idx = tidx + s = D * SQ[1] * SB[2] + while loop_idx <= s + r, i1, j2 = @index_to_tuple(loop_idx, D, SQ[1], SB[2]) + val = 0.0 + @inbounds for i2 in 1:SQ[2] + z3_idx = (tidy - 1) * SQ[2] * SQ[1] * D + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r + val += ij_mats[i2,r,j2,2] * Z3[z3_idx] + end + z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r + Z2[z2_idx] = val + loop_idx += tidx_step + end + CUDA.sync_threads() + + loop_idx = tidx + s = D * SB[1] * SB[2] + while loop_idx <= s + r, j1, j2 = @index_to_tuple(loop_idx, D, SB[1], SB[2]) + val = 0.0 + @inbounds for i1 in 1:SQ[1] + z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r + val += ij_mats[i1,r,j1,1] * Z2[z2_idx] + end + z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r + Z1[z1_idx] = val + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Assemble + loop_idx = tidx + s = SB[1] * SB[2] + while loop_idx <= s + i = dof_map[loop_idx] + id = ids[i] + val = 0.0 + @inbounds for r in 1:D + z1_idx = ((tidy - 1) * s + loop_idx-1) * D + r + val += Z1[z1_idx] + end + CUDA.@atomic y[abs(id)] += val * (id > 0) + loop_idx += tidx_step + end + CUDA.sync_threads() + + cell += gridDim().x * blockDim().y + end + + return +end + + + +function gpu_mul_v6!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq,ij_mats,ji_mats) where {D, SB, SQ} + dof_map = m.dof_map + 
#CUDA.Const(mats) + #CUDA.Const(dof_map) + tidy = threadIdx().y + tidx = threadIdx().x + tidx_step = blockDim().x + + s13 = blockDim().y*D*max(SB[1]*SB[2],SQ[1]*SQ[2]) + s2 = blockDim().y*D*SQ[1]*SB[2] + smats = D*D*SB[1]*SQ[1] + + Z = @cuDynamicSharedMem(Float64,s13+s2+2*smats) + Z1 = view(Z,1:s13) + Z2 = view(Z,s13+1:s13+s2) + Z3 = view(Z,1:s13) + + # Bring matrices to shared memory + f_mats = view(Z,s13+s2+1:s13+s2+smats) + b_mats = view(Z,s13+s2+smats+1:s13+s2+2*smats) + loop_idx = (tidy-1) * tidx_step + tidx + while loop_idx <= smats + f_mats[loop_idx] = ji_mats[loop_idx] + b_mats[loop_idx] = ij_mats[loop_idx] + loop_idx += tidx_step*blockDim().y + end + CUDA.sync_threads() + + cell = (blockIdx().x - 1) * blockDim().y + threadIdx().y + while cell <= nCells + # Scatter + ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + + loop_idx = tidx + s = D * SB[1] * SB[2] + while loop_idx <= s + r, i = @index_to_tuple(loop_idx, D, SB[1] * SB[2]) + I = dof_map[i]; j1 = I[1]; j2 = I[2] + id = ids[i] + z1_idx = (tidy - 1) * s + (j1 - 1) * SB[2] * D + (j2 - 1) * D + r + + Z1[z1_idx] = x[abs(id)] * (id > 0) + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Forward pass + loop_idx = tidx + s = D * SQ[1] * SB[2] + while loop_idx <= s + r, j2, i1 = @index_to_tuple(loop_idx, D, SB[2], SQ[1]) + + z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r # Not coalesced + Z2[z2_idx] = 0.0 + @inbounds for j1 in 1:SB[1] + z1_idx = (tidy - 1) * SB[1] * SB[2] * D + (j1 - 1) * SB[2] * D + (j2 - 1) * D + r # Coalesced + #fm_idx = (j1 - 1) * SQ[1] * D + (i1 - 1) * D + r + bm_idx = (i1 - 1) * SB[1] * D + (j1 - 1) * D + r + Z2[z2_idx] += f_mats[bm_idx] * Z1[z1_idx] + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + loop_idx = tidx + s = D * SQ[1] * SQ[2] + while loop_idx <= s + r, i1, i2 = @index_to_tuple(loop_idx, D, SQ[1], SQ[2]) + + z3_idx = (tidy - 1) * s + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r # Coalesced + Z3[z3_idx] = 0.0 + @inbounds for j2 in 1:SB[2] + z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r # Coalesced + #fm_idx = SB[2]*SQ[2]*D + (j2 - 1) * SQ[2] * D + (i2 - 1) * D + r + bm_idx = SQ[2]*SB[2]*D + (i2 - 1) * SB[2] * D + (j2 - 1) * D + r + Z3[z3_idx] += f_mats[bm_idx] * Z2[z2_idx] + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Apply weights + loop_idx = tidx + s = D * SQ[1] * SQ[2] + while loop_idx <= s + r, idx = @index_to_tuple(loop_idx, D, SQ[1]*SQ[2]) + z3_idx = (tidy - 1) * s + loop_idx + Z3[z3_idx] *= wq[idx] + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Backward pass + loop_idx = tidx + s = D * SQ[1] * SB[2] + while loop_idx <= s + r, i1, j2 = @index_to_tuple(loop_idx, D, SQ[1], SB[2]) + z2_idx = (tidy - 1) * s + (i1 - 1) * SB[2] * D + (j2 - 1) * D + r # Not coalesced + Z2[z2_idx] = 0.0 + @inbounds for i2 in 1:SQ[2] + z3_idx = (tidy - 1) * SQ[2] * SQ[1] * D + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r # Coalesced + #bm_idx = SQ[2]*SB[2]*D + (i2 - 1) * SB[2] * D + (j2 - 1) * D + r + fm_idx = SB[2]*SQ[2]*D + (j2 - 1) * SQ[2] * D + (i2 - 1) * D + r + Z2[z2_idx] += b_mats[fm_idx] * Z3[z3_idx] + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + loop_idx = tidx + s = D * SB[1] * SB[2] + while loop_idx <= s + r, j2, j1 = @index_to_tuple(loop_idx, D, SB[2], SB[1]) + z1_idx = (tidy - 1) * s + loop_idx + Z1[z1_idx] = 0.0 + @inbounds for i1 in 1:SQ[1] + z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (i1 - 1) * SB[2] * D + (j2 - 1) * D + r # Coalesced + #bm_idx = (i1 - 1) * SB[1] * D + (j1 - 
1) * D + r + fm_idx = (j1 - 1) * SQ[1] * D + (i1 - 1) * D + r + Z1[z1_idx] += b_mats[fm_idx] * Z2[z2_idx] + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Assemble + loop_idx = tidx + s = D * SB[1] * SB[2] + while loop_idx <= s + r, i = @index_to_tuple(loop_idx, D, SB[1] * SB[2]) + I = dof_map[i] + j1 = I[1] + j2 = I[2] + id = ids[i] + z1_idx = (tidy - 1) * s + (j1 - 1) * SB[2] * D + (j2 - 1) * D + r + CUDA.@atomic y[abs(id)] += Z1[z1_idx] * (id > 0) + loop_idx += tidx_step + end + CUDA.sync_threads() + + cell += gridDim().x * blockDim().y + end + + return +end + + +@generated function gpu_mul_v7!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq,ij_mats,ji_mats,_dof_map,::Val{blockdims}) where {D, SB, SQ,blockdims} + s13 = blockdims[2]*D*max(SB[1]*SB[2],SQ[1]*SQ[2]) + s2 = blockdims[2]*D*SQ[1]*SB[2] + smats = D*D*SB[1]*SQ[1] + smaps = prod(SB) + + return quote + tidy = threadIdx().y + tidx = threadIdx().x + + Z = @cuStaticSharedMem(Float64,$(s13+s2+2*smats)) + Z1 = view(Z,1:$(s13)) + Z2 = view(Z,$(s13+1):$(s13+s2)) + Z3 = view(Z,1:$(s13)) + + # Bring matrices to shared memory + f_mats = view(Z,$(s13+s2+1):$(s13+s2+smats)) + b_mats = view(Z,$(s13+s2+smats+1):$(s13+s2+2*smats)) + loop_idx = (tidy-1) * $(blockdims[1]) + tidx + while loop_idx <= $(smats) + f_mats[loop_idx] = ij_mats[loop_idx] + b_mats[loop_idx] = ji_mats[loop_idx] + loop_idx += $(blockdims[1]*blockdims[2]) + end + + dof_map = @cuStaticSharedMem(Int32,$smaps) + loop_idx = (tidy-1) * $(blockdims[1]) + tidx + while loop_idx <= $(smaps) + dof_map[loop_idx] = _dof_map[loop_idx] + loop_idx += $(blockdims[1]*blockdims[2]) + end + + CUDA.sync_threads() + + cell = (blockIdx().x - 1) * $(blockdims[2]) + threadIdx().y + while cell <= nCells + # Scatter + ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + + loop_idx = tidx + while loop_idx <= $(SB[1] * SB[2]) + id = ids[dof_map[loop_idx]] + xi = x[abs(id)] * (id > 0) + @inbounds for r in 1:$D + z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + (loop_idx - 1) * $D + r + Z1[z1_idx] = xi + end + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + # Forward pass + loop_idx = tidx + while loop_idx <= $(D * SQ[1] * SB[2]) + r, j2, i1 = @index_to_tuple(loop_idx, D, SB[2], SQ[1]) + + val = 0.0 + @inbounds for j1 in 1:$(SB[1]) + z1_idx = (tidy - 1) * $(SB[1] * SB[2] * D) + (j1 - 1) * $(SB[2] * D) + (j2 - 1) * $D + r # Coalesced + fm_idx = (j1 - 1) * SQ[1] * D + (i1 - 1) * D + r + #bm_idx = (i1 - 1) * $(SB[1] * D) + (j1 - 1) * $D + r + val += f_mats[fm_idx] * Z1[z1_idx] + end + z2_idx = (tidy - 1) * $(D * SQ[1] * SB[2]) + (j2 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r # Not coalesced + Z2[z2_idx] = val + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + loop_idx = tidx + while loop_idx <= $(D * SQ[1] * SQ[2]) + r, i1, i2 = @index_to_tuple(loop_idx, $D, $(SQ[1]), $(SQ[2])) + + val = 0.0 + @inbounds for j2 in 1:$(SB[2]) + z2_idx = (tidy - 1) * $(SB[2] * SQ[1] * D) + (j2 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r # Coalesced + fm_idx = SB[2]*SQ[2]*D + (j2 - 1) * SQ[2] * D + (i2 - 1) * D + r + #bm_idx = $(SQ[2]*SB[2]*D) + (i2 - 1) * $(SB[2] * D) + (j2 - 1) * $D + r + val += f_mats[fm_idx] * Z2[z2_idx] + end + z3_idx = (tidy - 1) * $(D * SQ[1] * SQ[2]) + (i2 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r # Coalesced + Z3[z3_idx] = val + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + # Apply weights + loop_idx = tidx + while loop_idx <= $(D * SQ[1] * SQ[2]) + r, idx = @index_to_tuple(loop_idx, $D, $(SQ[1]*SQ[2])) + z3_idx = (tidy - 1) * $(D * SQ[1] * 
SQ[2]) + loop_idx + Z3[z3_idx] *= wq[idx] + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + # Backward pass + loop_idx = tidx + while loop_idx <= $(D * SQ[1] * SB[2]) + r, i1, j2 = @index_to_tuple(loop_idx, $D, $(SQ[1]), $(SB[2])) + + val = 0.0 + @inbounds for i2 in 1:$(SQ[2]) + z3_idx = (tidy - 1) * $(SQ[2] * SQ[1] * D) + (i2 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r # Coalesced + bm_idx = SQ[2]*SB[2]*D + (i2 - 1) * SB[2] * D + (j2 - 1) * D + r + #fm_idx = $(SB[2]*SQ[2]*D) + (j2 - 1) * $(SQ[2] * D) + (i2 - 1) * $D + r + val += b_mats[bm_idx] * Z3[z3_idx] + end + z2_idx = (tidy - 1) * $(D * SQ[1] * SB[2]) + (i1 - 1) * $(SB[2] * D) + (j2 - 1) * $D + r # Not coalesced + Z2[z2_idx] = val + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + loop_idx = tidx + while loop_idx <= $(D * SB[1] * SB[2]) + r, j2, j1 = @index_to_tuple(loop_idx, $D, $(SB[2]), $(SB[1])) + + val = 0.0 + @inbounds for i1 in 1:$(SQ[1]) + z2_idx = (tidy - 1) * $(SB[2] * SQ[1] * D) + (i1 - 1) * $(SB[2] * D) + (j2 - 1) * $D + r # Coalesced + bm_idx = (i1 - 1) * SB[1] * D + (j1 - 1) * D + r + #fm_idx = (j1 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r + val += b_mats[bm_idx] * Z2[z2_idx] + end + z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + loop_idx + Z1[z1_idx] = val + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + # Assemble + loop_idx = tidx + while loop_idx <= $(SB[1] * SB[2]) + id = ids[dof_map[loop_idx]] + yi = 0.0 + @inbounds for r in 1:D + z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + (loop_idx - 1) * $D + r + yi += Z1[z1_idx] + end + CUDA.@atomic y[abs(id)] += yi * (id > 0) + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + cell += gridDim().x * blockDim().y + end + + return + end end \ No newline at end of file diff --git a/src/NCIHackaton2023.jl b/src/NCIHackaton2023.jl index 12e222c..f2cf126 100644 --- a/src/NCIHackaton2023.jl +++ b/src/NCIHackaton2023.jl @@ -31,6 +31,9 @@ export gpu_mul_v1! export gpu_mul_v2! export gpu_mul_v3! export gpu_mul_v4! +export gpu_mul_v5! +export gpu_mul_v6! +export gpu_mul_v7! export count_manual_flops_poisson_matrix_free diff --git a/src/SumFactorizationMaps.jl b/src/SumFactorizationMaps.jl index 84f4bb9..5a3756e 100644 --- a/src/SumFactorizationMaps.jl +++ b/src/SumFactorizationMaps.jl @@ -28,7 +28,8 @@ function to_gpu(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} mats[:, :, r, k] = m.mats[2][r, k] end gpu_mats = CuArray(mats) - return SumFactorizationMap{D, SB, SQ}(gpu_mats, CuArray(m.dof_map)) + D_32 = Int32(D); SB_32 = Tuple(map(Int32,SB)); SQ_32 = Tuple(map(Int32,SQ)) + return SumFactorizationMap{D_32, SB_32, SQ_32}(gpu_mats, CuArray(m.dof_map)) end function _get_terms(poly::Polytope, orders) diff --git a/test/GPU_v0.jl b/test/GPU_v0.jl index bfad0ae..0f3c4b8 100644 --- a/test/GPU_v0.jl +++ b/test/GPU_v0.jl @@ -79,14 +79,13 @@ mul!(y_ref, A_lazy, x_ref) cpu_y = Array(y) cpu_y ≈ y_ref +# Benchmark +niter = 100 +time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + # Profile CUDA.@profile begin for iter in 1:10 CUDA.@sync kernel(kernel_args...; config...) 
end -end - -# Benchmark -niter = 100 -time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) - +end \ No newline at end of file diff --git a/test/GPU_v1.jl b/test/GPU_v1.jl index a2cfdf1..dec75a9 100644 --- a/test/GPU_v1.jl +++ b/test/GPU_v1.jl @@ -16,8 +16,8 @@ using NCIHackaton2023 # Parameters D = 2 # Problem dimension -fe_orders = Tuple(fill(1, D)) # FE element orders -quad_orders = Tuple(fill(4, D)) # Quadrature orders +fe_orders = Tuple(fill(4, D)) # FE element orders +quad_orders = Tuple(fill(6, D)) # Quadrature orders # Setup n = 512 @@ -47,8 +47,8 @@ A_lazy = LazyMatrix(m, U, V, dΩ) ############################################################################################ # GPU implementation nCells = num_cells(Ω) -nt = 4 * 192 -nb = 80 +nt = 16*56 +nb = 320 gpu_m = to_gpu(m) gpu_cell_dof_ids = to_gpu(get_cell_dof_ids(U)); @@ -71,7 +71,7 @@ kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq, gpu_Zk...) kernel = @cuda name = "gpu_mul_v1" launch = false gpu_mul_v1!(kernel_args...); config = launch_configuration(kernel.fun) -config = (threads = (32,20), blocks = 80) +config = (threads = (16,56), blocks = 320) kernel(kernel_args...; config...) y_ref = zeros(length(b)) @@ -80,13 +80,12 @@ mul!(y_ref, A_lazy, x_ref) cpu_y = Array(y) cpu_y ≈ y_ref -# Profile +# Benchmark +niter = 100 +time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + CUDA.@profile begin for iter in 1:10 CUDA.@sync kernel(kernel_args...; config...) end end - -# Benchmark -niter = 100 -time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) diff --git a/test/GPU_v2.jl b/test/GPU_v2.jl index 09cb8f9..53ece57 100644 --- a/test/GPU_v2.jl +++ b/test/GPU_v2.jl @@ -56,6 +56,17 @@ gpu_wq = CuArray(cell_wq.value) gpu_jq = CuArray(cell_jq.value) gpu_djq = CuArray(cell_djq.value) +function get_mats(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + ji_mats = zeros((D,SB[1],SQ[1],D)) + ij_mats = zeros((D,SQ[1],SB[1],D)) + for r in 1:D, k in 1:D, i in 1:SQ[1], j in 1:SB[1] + ji_mats[r,j,i,k] = m.mats[2][k,r][i,j] + ij_mats[r,i,j,k] = m.mats[2][k,r][i,j] + end + return CuArray(ij_mats), CuArray(ji_mats) +end +ij_mats,ji_mats = get_mats(m) + # Caches D, SB, SQ = get_dimensional_parameters(m) @@ -63,13 +74,13 @@ D, SB, SQ = get_dimensional_parameters(m) x_ref = ones(size(b)) x = CuArray(x_ref) y = CuArray(zeros(size(b))) -kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq) +kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq,ij_mats,ji_mats) -kernel = @cuda name = "gpu_mul_v2" launch = false gpu_mul_v2!(kernel_args...); +kernel = @cuda name = "gpu_mul_v2" launch = false gpu_mul_v6!(kernel_args...); config = launch_configuration(kernel.fun) -mem = 16*D*(prod(SB) + SQ[1]*SB[2] + prod(SQ))*sizeof(Float64) -config = (threads=(32,16),blocks=160,shmem=mem) +mem = 64*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) + 2*D*D*SB[1]*SQ[1]*sizeof(Float64) +config = (threads=(8,64),blocks=320,shmem=mem) kernel(kernel_args...;config...) y_ref = zeros(length(b)) @@ -80,3 +91,9 @@ cpu_y ≈ y_ref niter = 100 time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + +CUDA.@profile begin + for iter in 1:10 + CUDA.@sync kernel(kernel_args...; config...) + end +end diff --git a/test/GPU_v3.jl b/test/GPU_v3.jl index ad2c2b0..cfd9df4 100644 --- a/test/GPU_v3.jl +++ b/test/GPU_v3.jl @@ -1,6 +1,6 @@ """ - SUMFAC-GPU v2 - Basic kernel. Starting to take advantage of Shared & Constant memory. 
+ SUMFAC-GPU v3 + Tackling the issue of memory access coalescence. """ using Test @@ -20,7 +20,7 @@ fe_orders = Tuple(fill(4, D)) # FE element orders quad_orders = Tuple(fill(6, D)) # Quadrature orders # Setup -n = 1024 +n = 512 domain = repeat([0, 1], D) partition = fill(n, D) model = CartesianDiscreteModel(domain, partition) @@ -53,6 +53,16 @@ function get_mats(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} return CuArray(ij_mats), CuArray(ji_mats) end +function get_dof_map(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + dof_map = Vector{Int32}(undef,prod(SB)) + for i in 1:prod(SB) + I = m.dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) + idx = (j2 - 1) * SB[1] + j1 + dof_map[idx] = i + end + return CuArray(dof_map) +end + ############################################################################################ ############################################################################################ # GPU implementation @@ -61,6 +71,7 @@ nCells = num_cells(Ω) gpu_m = to_gpu(m) gpu_cell_dof_ids = to_gpu(get_cell_dof_ids(U)); ij_mats, ji_mats = get_mats(m) +dof_map = get_dof_map(m) cell_wq, cell_jq, cell_djq = A_lazy.cell_quantities gpu_wq = CuArray(cell_wq.value) @@ -68,24 +79,24 @@ gpu_jq = CuArray(cell_jq.value) gpu_djq = CuArray(cell_djq.value) # Caches -D, SB, SQ = get_dimensional_parameters(m) +D, SB, SQ = get_dimensional_parameters(gpu_m) # Comparison CPU vs GPU x_ref = ones(size(b)) x = CuArray(x_ref) y = CuArray(zeros(size(b))) -kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq, ij_mats, ji_mats) +kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq, ij_mats, ji_mats, dof_map) kernel = @cuda name = "gpu_mul_v3" launch = false gpu_mul_v3!(kernel_args...); config = launch_configuration(kernel.fun) -mem = 16*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) -config = (threads=(16,16),blocks=320,shmem=mem) +mem = 32*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) +config = (threads=(8,32),blocks=320,shmem=mem) kernel(kernel_args...;config...) y_ref = zeros(length(b)) @elapsed mul!(y_ref, A_lazy, x_ref) -@elapsed mul!(y_ref, A, x_ref) +#@elapsed mul!(y_ref, A, x_ref) cpu_y = Array(y) cpu_y ≈ y_ref diff --git a/test/GPU_v5.jl b/test/GPU_v5.jl new file mode 100644 index 0000000..97abe53 --- /dev/null +++ b/test/GPU_v5.jl @@ -0,0 +1,111 @@ +""" + SUMFAC-GPU v3/v5 + Tackling the issue of memory access coalescence. 
+""" + +using Test +using LinearAlgebra + +using Gridap +using Gridap.TensorValues +using Gridap.Arrays + +using CUDA +using Adapt +using NCIHackaton2023 + +# Parameters +D = 2 # Problem dimension +fe_orders = Tuple(fill(4, D)) # FE element orders +quad_orders = Tuple(fill(6, D)) # Quadrature orders + +# Setup +n = 512 +domain = repeat([0, 1], D) +partition = fill(n, D) +model = CartesianDiscreteModel(domain, partition) + +Ω = Triangulation(model) +dΩ = Measure(Ω, quad_orders[1]) + +g(x) = 0.0 +reffe = ReferenceFE(lagrangian, Float64, fe_orders) +V = FESpace(Ω, reffe; dirichlet_tags = ["boundary"]) +U = TrialFESpace(V, g) + +# Assembled matrix +f(x) = 1.0 +op = AffineFEOperator((u, v) -> ∫(∇(u) ⋅ ∇(v))dΩ, v -> ∫(v * f)dΩ, U, V) +A = get_matrix(op) +b = get_vector(op) + +# Sum-Factorization based matrix +m = SumFactorizationMap(D, fe_orders, quad_orders) +A_lazy = LazyMatrix(m, U, V, dΩ) + +function get_mats(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + ji_mats = zeros((SB[1],D,SQ[1],D)) + ij_mats = zeros((SQ[1],D,SB[1],D)) + for r in 1:D, k in 1:D, i in 1:SQ[1], j in 1:SB[1] + ji_mats[j,r,i,k] = m.mats[2][k,r][i,j] + ij_mats[i,r,j,k] = m.mats[2][k,r][i,j] + end + return CuArray(ij_mats), CuArray(ji_mats) +end + +function get_dof_map(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + dof_map = Vector{Int32}(undef,prod(SB)) + for i in 1:prod(SB) + I = m.dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) + idx = (j2 - 1) * SB[1] + j1 + dof_map[idx] = i + end + return CuArray(dof_map) +end + +############################################################################################ +############################################################################################ +# GPU implementation +nCells = num_cells(Ω) + +gpu_m = to_gpu(m) +gpu_cell_dof_ids = to_gpu(get_cell_dof_ids(U)); +ij_mats, ji_mats = get_mats(m) +dof_map = get_dof_map(m) + +cell_wq, cell_jq, cell_djq = A_lazy.cell_quantities +gpu_wq = CuArray(cell_wq.value) +gpu_jq = CuArray(cell_jq.value) +gpu_djq = CuArray(cell_djq.value) + +# Caches +D, SB, SQ = get_dimensional_parameters(gpu_m) + +# Comparison CPU vs GPU +x_ref = ones(size(b)) +x = CuArray(x_ref) +y = CuArray(zeros(size(b))) +kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq, ij_mats, ji_mats, dof_map) + +kernel = @cuda name = "gpu_mul_v5" launch = false gpu_mul_v5!(kernel_args...); +config = launch_configuration(kernel.fun) + +mem = 32*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) +config = (threads=(8,32),blocks=320,shmem=mem) +kernel(kernel_args...;config...) + +y_ref = zeros(length(b)) +@elapsed mul!(y_ref, A_lazy, x_ref) +#@elapsed mul!(y_ref, A, x_ref) + +cpu_y = Array(y) +cpu_y ≈ y_ref + +niter = 100 +time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + +CUDA.@profile begin + for iter in 1:10 + CUDA.@sync kernel(kernel_args...; config...) + end +end diff --git a/test/GPU_v7.jl b/test/GPU_v7.jl new file mode 100644 index 0000000..4cc8983 --- /dev/null +++ b/test/GPU_v7.jl @@ -0,0 +1,111 @@ +""" + SUMFAC-GPU v2 + Basic kernel. Starting to take advantage of Shared & Constant memory. 
+""" + +using Test +using LinearAlgebra + +using Gridap +using Gridap.TensorValues +using Gridap.Arrays + +using CUDA +using Adapt +using NCIHackaton2023 + +# Parameters +D = 2 # Problem dimension +fe_orders = Tuple(fill(4, D)) # FE element orders +quad_orders = Tuple(fill(6, D)) # Quadrature orders + +# Setup +n = 512 +domain = repeat([0, 1], D) +partition = fill(n, D) +model = CartesianDiscreteModel(domain, partition) + +Ω = Triangulation(model) +dΩ = Measure(Ω, quad_orders[1]) + +g(x) = 0.0 +reffe = ReferenceFE(lagrangian, Float64, fe_orders) +V = FESpace(Ω, reffe; dirichlet_tags = ["boundary"]) +U = TrialFESpace(V, g) + +# Assembled matrix +f(x) = 1.0 +#op = AffineFEOperator((u, v) -> ∫(∇(u) ⋅ ∇(v))dΩ, v -> ∫(v * f)dΩ, U, V) +#A = get_matrix(op) +#b = get_vector(op) + +# Sum-Factorization based matrix +m = SumFactorizationMap(D, fe_orders, quad_orders) +A_lazy = LazyMatrix(m, U, V, dΩ) + +############################################################################################ +############################################################################################ +# GPU implementation +nCells = num_cells(Ω) +nDofs = num_free_dofs(U) + +gpu_m = to_gpu(m) +gpu_cell_dof_ids = to_gpu(get_cell_dof_ids(U)); + +cell_wq, cell_jq, cell_djq = A_lazy.cell_quantities +gpu_wq = CuArray(cell_wq.value) +gpu_jq = CuArray(cell_jq.value) +gpu_djq = CuArray(cell_djq.value) + +function get_mats(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + ji_mats = zeros((D,SB[1],SQ[1],D)) + ij_mats = zeros((D,SQ[1],SB[1],D)) + for r in 1:D, k in 1:D, i in 1:SQ[1], j in 1:SB[1] + ji_mats[r,j,i,k] = m.mats[2][k,r][i,j] + ij_mats[r,i,j,k] = m.mats[2][k,r][i,j] + end + return CuArray(ij_mats), CuArray(ji_mats) +end +ij_mats,ji_mats = get_mats(m) + +function get_dof_map(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + dof_map = Vector{Int32}(undef,prod(SB)) + for i in 1:prod(SB) + I = m.dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) + idx = (j1 - 1) * SB[1] + j2 + dof_map[idx] = i + end + return CuArray(dof_map) +end +dof_map = get_dof_map(m) + +# Caches +D, SB, SQ = get_dimensional_parameters(m) + +# Comparison CPU vs GPU +x_ref = ones(nDofs) +x = CuArray(x_ref) +y = CuArray(zeros(nDofs)) +kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq,ij_mats,ji_mats,dof_map,Val((8,64))) + +kernel = @cuda name = "gpu_mul_v7" launch = false gpu_mul_v7!(kernel_args...); +config = launch_configuration(kernel.fun) + +mem = 64*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) + 2*D*D*SB[1]*SQ[1]*sizeof(Float64) +config = (threads=(8,64),blocks=1280)#,shmem=mem) +kernel(kernel_args...;config...) + +y_ref = zeros(nDofs) +@elapsed mul!(y_ref, A_lazy, x_ref) + +cpu_y = Array(y) +cpu_y ≈ y_ref + +niter = 200 +time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + +CUDA.@profile begin + for iter in 1:10 + CUDA.@sync kernel(kernel_args...; config...) 
+ end +end \ No newline at end of file diff --git a/test/cdot.jl b/test/benchmark_cdot.jl similarity index 100% rename from test/cdot.jl rename to test/benchmark_cdot.jl diff --git a/test/my_test.jl b/test/my_test.jl index 4fce094..622206d 100644 --- a/test/my_test.jl +++ b/test/my_test.jl @@ -17,3 +17,12 @@ end main() + +############################################################################################ + + + +dof_map + + + From 308e527565a9c1b9a9f61b410c0ec5081cb0352b Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Mon, 19 Jun 2023 11:29:16 +1000 Subject: [PATCH 3/4] minor changes --- profile_cuda_ncu.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/profile_cuda_ncu.sh b/profile_cuda_ncu.sh index 470f358..ae80d08 100755 --- a/profile_cuda_ncu.sh +++ b/profile_cuda_ncu.sh @@ -14,7 +14,10 @@ module load nvidia-hpc-sdk/22.11 export LD_LIBRARY_PATH=~/bin/julia-1.8.5/lib/julia/:$LD_LIBRARY_PATH ncu --set full -k regex:gpu_mul --target-processes all -o result_v1 julia --project=. test/GPU_v1.jl +ncu --set full -k regex:gpu_mul --target-processes all -o result_v2 julia --project=. test/GPU_v2.jl ncu --set full -k regex:gpu_mul --target-processes all -o result_v3 julia --project=. test/GPU_v3.jl +ncu --set full -k regex:gpu_mul --target-processes all -o result_v5 julia --project=. test/GPU_v5.jl +ncu --set full -k regex:gpu_mul --target-processes all -o result_v7 julia --project=. test/GPU_v7.jl #ncu launch --trace cuda julia --project=. -e'include("test/GPU_v0.jl")' #ncu launch --trace cuda julia --project=. -e'include("test/GPU_v1.jl")' From 4d02a70ffc7017f51ebc5b6e83cd96b2e452f324 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 27 Jun 2023 18:15:21 +1000 Subject: [PATCH 4/4] Save progress --- profile_cuda_ncu.sh | 4 ++-- src/GPUKernels.jl | 34 +++++++++++++++++++++------------- test/GPU_v7.jl | 2 +- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/profile_cuda_ncu.sh b/profile_cuda_ncu.sh index ae80d08..51910f0 100755 --- a/profile_cuda_ncu.sh +++ b/profile_cuda_ncu.sh @@ -1,12 +1,12 @@ #!/bin/bash -#PBS -P vp91 +#PBS -P kr97 #PBS -q gpuvolta #PBS -l walltime=01:00:00 #PBS -l ncpus=12 #PBS -l ngpus=1 #PBS -l mem=90GB #PBS -l jobfs=10GB -#PBS -l storage=scratch/vp91 +#PBS -l storage=scratch/kr97 module load cuda module load nvidia-hpc-sdk/22.11 diff --git a/src/GPUKernels.jl b/src/GPUKernels.jl index 8b0b45c..81ef25e 100644 --- a/src/GPUKernels.jl +++ b/src/GPUKernels.jl @@ -925,21 +925,13 @@ end tidy = threadIdx().y tidx = threadIdx().x - Z = @cuStaticSharedMem(Float64,$(s13+s2+2*smats)) + Z = @cuStaticSharedMem(Float64,$(s13+s2)) Z1 = view(Z,1:$(s13)) Z2 = view(Z,$(s13+1):$(s13+s2)) Z3 = view(Z,1:$(s13)) # Bring matrices to shared memory - f_mats = view(Z,$(s13+s2+1):$(s13+s2+smats)) - b_mats = view(Z,$(s13+s2+smats+1):$(s13+s2+2*smats)) - loop_idx = (tidy-1) * $(blockdims[1]) + tidx - while loop_idx <= $(smats) - f_mats[loop_idx] = ij_mats[loop_idx] - b_mats[loop_idx] = ji_mats[loop_idx] - loop_idx += $(blockdims[1]*blockdims[2]) - end - + ids = @cuStaticSharedMem(Int64,$smaps) dof_map = @cuStaticSharedMem(Int32,$smaps) loop_idx = (tidy-1) * $(blockdims[1]) + tidx while loop_idx <= $(smaps) @@ -949,14 +941,30 @@ end CUDA.sync_threads() + f_mats = @cuStaticSharedMem(Float64,$smats) + #b_mats = @cuStaticSharedMem(Float64,$smats) + loop_idx = (tidy-1) * $(blockdims[1]) + tidx + while loop_idx <= $(smats) + f_mats[loop_idx] = ij_mats[loop_idx] + #b_mats[loop_idx] = ji_mats[loop_idx] + loop_idx += $(blockdims[1]*blockdims[2]) + end + b_mats = f_mats + cell = 
(blockIdx().x - 1) * $(blockdims[2]) + threadIdx().y while cell <= nCells # Scatter - ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + _ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + loop_idx = tidx + while loop_idx <= $(SB[1] * SB[2]) + ids[loop_idx] = _ids[dof_map[loop_idx]] + loop_idx += blockdims[1] + end + CUDA.sync_threads() loop_idx = tidx while loop_idx <= $(SB[1] * SB[2]) - id = ids[dof_map[loop_idx]] + id = ids[loop_idx] #ids[dof_map[loop_idx]] xi = x[abs(id)] * (id > 0) @inbounds for r in 1:$D z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + (loop_idx - 1) * $D + r @@ -1049,7 +1057,7 @@ end # Assemble loop_idx = tidx while loop_idx <= $(SB[1] * SB[2]) - id = ids[dof_map[loop_idx]] + id = ids[loop_idx] #ids[dof_map[loop_idx]] yi = 0.0 @inbounds for r in 1:D z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + (loop_idx - 1) * $D + r diff --git a/test/GPU_v7.jl b/test/GPU_v7.jl index 4cc8983..f91bcdc 100644 --- a/test/GPU_v7.jl +++ b/test/GPU_v7.jl @@ -92,7 +92,7 @@ kernel = @cuda name = "gpu_mul_v7" launch = false gpu_mul_v7!(kernel_args...); config = launch_configuration(kernel.fun) mem = 64*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) + 2*D*D*SB[1]*SQ[1]*sizeof(Float64) -config = (threads=(8,64),blocks=1280)#,shmem=mem) +config = (threads=(5,64),blocks=1280)#,shmem=mem) kernel(kernel_args...;config...) y_ref = zeros(nDofs)
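
Note on the algorithm behind the kernels above: every gpu_mul_v* variant computes the same cell-local sum-factorized product (scatter the local dof values, contract each tensor direction into quadrature space, apply the quadrature weights, contract back, and atomically assemble into the global vector); the versions differ only in where the buffers and 1D matrices live (dynamic/static shared memory, constant memory), how the (r, i1, i2) indices are mapped to threads, and how the matrices are laid out for coalescing. The serial Julia sketch below is an illustrative reconstruction of that per-cell operation for the 2D case, not code from the patches: the helper name sumfac_cell_mul! is hypothetical, mats[i, j, direction, r] follows the layout read by gpu_mul_v2!, dof_map[i] is assumed to return the (j1, j2) tensor indices of local dof i (as m.dof_map does in the kernels), and non-positive ids are assumed to mark constrained dofs, matching the x[abs(id)] * (id > 0) masking used throughout.

# Hypothetical serial reference (not part of the patches) of the cell-local
# product that the gpu_mul_v* CUDA kernels parallelise, 2D case.
#   D  : number of gradient components,  SB/SQ : 1D basis / quadrature sizes
#   ids: global dof ids of the cell,  dof_map[i] -> (j1, j2) of local dof i
function sumfac_cell_mul!(y, x, ids, dof_map, mats, wq, D, SB, SQ)
    Z1 = zeros(D, SB[1], SB[2])   # dof-space buffer
    Z2 = zeros(D, SQ[1], SB[2])   # half-transformed buffer
    Z3 = zeros(D, SQ[1], SQ[2])   # quadrature-space buffer

    # Scatter: gather local dof values, replicated over the D components
    for (i, J) in enumerate(dof_map)
        id  = ids[i]
        val = id > 0 ? x[id] : 0.0          # id <= 0 marks a constrained dof
        Z1[:, J[1], J[2]] .= val
    end

    # Forward pass: contract dof indices (j1, j2) into quadrature indices (i1, i2)
    for r in 1:D, j2 in 1:SB[2], i1 in 1:SQ[1]
        Z2[r, i1, j2] = sum(mats[i1, j1, 1, r] * Z1[r, j1, j2] for j1 in 1:SB[1])
    end
    for r in 1:D, i2 in 1:SQ[2], i1 in 1:SQ[1]
        Z3[r, i1, i2] = sum(mats[i2, j2, 2, r] * Z2[r, i1, j2] for j2 in 1:SB[2])
    end

    # Apply quadrature weights, indexed as (i2-1)*SQ[1] + i1 as in the kernels
    for r in 1:D, i2 in 1:SQ[2], i1 in 1:SQ[1]
        Z3[r, i1, i2] *= wq[(i2 - 1) * SQ[1] + i1]
    end

    # Backward pass: contract quadrature indices back into dof indices
    for r in 1:D, j2 in 1:SB[2], i1 in 1:SQ[1]
        Z2[r, i1, j2] = sum(mats[i2, j2, 2, r] * Z3[r, i1, i2] for i2 in 1:SQ[2])
    end
    for r in 1:D, j2 in 1:SB[2], j1 in 1:SB[1]
        Z1[r, j1, j2] = sum(mats[i1, j1, 1, r] * Z2[r, i1, j2] for i1 in 1:SQ[1])
    end

    # Assemble: accumulate into the global vector, summing the D components
    for (i, J) in enumerate(dof_map)
        id = ids[i]
        id > 0 && (y[id] += sum(Z1[:, J[1], J[2]]))
    end
    return y
end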