From dfc1b26ef53935715c46f66885b9c77fbfea8755 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Mon, 12 Jun 2023 11:28:26 +1000 Subject: [PATCH 1/4] Add acknowledgements to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c55f3fd..e6662a2 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ Repo for the NCI-NVIDIA hackaton on June 2023. +This work was completed in part at the NCI-NVIDIA hackathon, part of the Open Hackathons program. The authors would like to acknowledge OpenACC-Standard.org for their support. + ## Objective The objective is to implement sum-factorization techniques for GPU architectures, within From a73fd6edaf2e41cc5dd5cceb6e4d88e61020e657 Mon Sep 17 00:00:00 2001 From: Jordi Manyer Date: Fri, 16 Jun 2023 17:26:58 +1000 Subject: [PATCH 2/4] Save progress --- src/GPUKernels.jl | 477 ++++++++++++++++++++++++++-- src/NCIHackaton2023.jl | 3 + src/SumFactorizationMaps.jl | 3 +- test/GPU_v0.jl | 11 +- test/GPU_v1.jl | 19 +- test/GPU_v2.jl | 25 +- test/GPU_v3.jl | 27 +- test/GPU_v5.jl | 111 +++++++ test/GPU_v7.jl | 111 +++++++ test/{cdot.jl => benchmark_cdot.jl} | 0 test/my_test.jl | 9 + 11 files changed, 743 insertions(+), 53 deletions(-) create mode 100644 test/GPU_v5.jl create mode 100644 test/GPU_v7.jl rename test/{cdot.jl => benchmark_cdot.jl} (100%) diff --git a/src/GPUKernels.jl b/src/GPUKernels.jl index 53cb79f..8b0b45c 100644 --- a/src/GPUKernels.jl +++ b/src/GPUKernels.jl @@ -239,10 +239,10 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, tidx_step = blockDim().x s1 = blockDim().y*D*SB[1]*SB[2]; s2 = blockDim().y*D*SQ[1]*SB[2]; s3 = blockDim().y*D*SQ[1]*SQ[2]; - Z = @cuDynamicSharedMem(Float64,s1+s2+s3) + Z = @cuDynamicSharedMem(Float64,max(s1,s3)+s2) Z1 = view(Z,1:s1) - Z2 = view(Z,s1+1:s1+s2) - Z3 = view(Z,s1+s2+1:s1+s2+s3) + Z2 = view(Z,max(s1,s3)+1:s1+s2) + Z3 = view(Z,1:s3) cell = (blockIdx().x - 1) * blockDim().y + threadIdx().y while cell <= nCells @@ -259,7 +259,7 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, id = ids[i] z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r - Z1[z1_idx] = x[max(id, 1)] * (id > 0) + Z1[z1_idx] = x[abs(id)] * (id > 0) loop_idx += tidx_step end CUDA.sync_threads() @@ -272,7 +272,7 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z2[z2_idx] = 0.0 - for j1 in 1:SB[1] + @inbounds for j1 in 1:SB[1] z1_idx = (tidy - 1) * SB[2] * SB[1] * D + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r Z2[z2_idx] += mats[i1, j1, 1, r] * Z1[z1_idx] end @@ -287,11 +287,10 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, z3_idx = (tidy - 1) * s + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z3[z3_idx] = 0.0 - for j2 in 1:SB[2] + @inbounds for j2 in 1:SB[2] z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z3[z3_idx] += mats[i2, j2, 2, r] * Z2[z2_idx] end - loop_idx += tidx_step end CUDA.sync_threads() @@ -315,7 +314,7 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, r, i1, j2 = @index_to_tuple(loop_idx, D, SQ[1], SB[2]) z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z2[z2_idx] = 0.0 - for i2 in 1:SQ[2] + @inbounds for i2 in 1:SQ[2] z3_idx = (tidy - 1) * SQ[2] * SQ[1] * D + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z2[z2_idx] += mats[i2, j2, 2, r] * Z3[z3_idx] end @@ -329,7 +328,7 @@ function 
gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, r, j1, j2 = @index_to_tuple(loop_idx, D, SB[1], SB[2]) z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r Z1[z1_idx] = 0.0 - for i1 in 1:SQ[1] + @inbounds for i1 in 1:SQ[1] z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r Z1[z1_idx] += mats[i1, j1, 1, r] * Z2[z2_idx] end @@ -347,9 +346,7 @@ function gpu_mul_v2!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, j2 = I[2] id = ids[i] z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r - if id > 0 - CUDA.@atomic y[id] += Z1[z1_idx] - end + CUDA.@atomic y[abs(id)] += Z1[z1_idx] * (id > 0) loop_idx += tidx_step end CUDA.sync_threads() @@ -365,10 +362,11 @@ end """ SUMFAC-GPU Kernel v3 Fourth version. Reordering of the matrices to obtain better memory access patterns. + Matrices are still in ConstantMemory. - We process blockDim().y cells at the same time. - We use blockDim().x threads to process each cell. """ -function gpu_mul_v3!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq, ij_mats, ji_mats) where {D, SB, SQ} +function gpu_mul_v3!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq, ij_mats, ji_mats,dof_map) where {D, SB, SQ} dof_map = m.dof_map CUDA.Const(ij_mats) CUDA.Const(ji_mats) @@ -392,13 +390,10 @@ function gpu_mul_v3!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, s = D * SB[1] * SB[2] while loop_idx <= s r, i = @index_to_tuple(loop_idx, D, SB[1] * SB[2]) - I = dof_map[i] - j1 = I[1] - j2 = I[2] + I = dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) id = ids[i] z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r - - Z1[z1_idx] = x[max(id, 1)] * (id > 0) + Z1[z1_idx] = x[abs(id)] * (id > 0) loop_idx += tidx_step end CUDA.sync_threads() @@ -481,14 +476,10 @@ function gpu_mul_v3!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, s = D * SB[1] * SB[2] while loop_idx <= s r, i = @index_to_tuple(loop_idx, D, SB[1] * SB[2]) - I = dof_map[i] - j1 = I[1] - j2 = I[2] + I = dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) id = ids[i] z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r - if id > 0 - CUDA.@atomic y[id] += Z1[z1_idx] - end + CUDA.@atomic y[abs(id)] += Z1[z1_idx] * (id > 0) loop_idx += tidx_step end CUDA.sync_threads() @@ -636,4 +627,442 @@ function gpu_mul_v4!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, end return +end + + +""" + SUMFAC-GPU Kernel v5 + Try to reorder some of the indexes to obtain better accesses to SharedMemory. + Matrices still in ConstantMemory. + - We process blockDim().y cells at the same time. + - We use blockDim().x threads to process each cell. 
+""" +function gpu_mul_v5!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq, ij_mats, ji_mats, dof_map) where {D, SB, SQ} + CUDA.Const(ij_mats) + CUDA.Const(ji_mats) + CUDA.Const(dof_map) + tidy = threadIdx().y + tidx = threadIdx().x + tidx_step = blockDim().x + + s1 = blockDim().y*D*SB[1]*SB[2]; s2 = blockDim().y*D*SQ[1]*SB[2]; s3 = blockDim().y*D*SQ[1]*SQ[2]; + Z = @cuDynamicSharedMem(Float64,max(s1,s3)+s2) + Z1 = view(Z,1:s1) + Z2 = view(Z,max(s1,s3)+1:s1+s2) + Z3 = view(Z,1:s3) + + cell = (blockIdx().x - 1) * blockDim().y + threadIdx().y + while cell <= nCells + # Scatter + ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + + loop_idx = tidx + s = SB[1] * SB[2] + while loop_idx <= s + i = dof_map[loop_idx] + id = ids[i] + val = x[abs(id)] * (id > 0) + @inbounds for r in 1:D + z1_idx = (tidy - 1) * s * D + (loop_idx-1)*D + r + Z1[z1_idx] = val + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Forward pass + loop_idx = tidx + s = D * SQ[1] * SB[2] + while loop_idx <= s + r, i1, j2 = @index_to_tuple(loop_idx, D, SQ[1], SB[2]) + val = 0.0 + @inbounds for j1 in 1:SB[1] + z1_idx = (tidy - 1) * SB[2] * SB[1] * D + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r + val += ji_mats[j1,r,i1,1] * Z1[z1_idx] + end + z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r + Z2[z2_idx] = val + loop_idx += tidx_step + end + CUDA.sync_threads() + + loop_idx = tidx + s = D * SQ[1] * SQ[2] + while loop_idx <= s + r, i1, i2 = @index_to_tuple(loop_idx, D, SQ[1], SQ[2]) + val = 0.0 + @inbounds for j2 in 1:SB[2] + z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r + val += ji_mats[j2,r,i2,2] * Z2[z2_idx] + end + z3_idx = (tidy - 1) * s + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r + Z3[z3_idx] = val + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Apply weights + loop_idx = tidx + s = D * SQ[1] * SQ[2] + while loop_idx <= s + r, i1, i2 = @index_to_tuple(loop_idx, D, SQ[1], SQ[2]) + idx = (i2 - 1) * SQ[1] + i1 + z3_idx = (tidy - 1) * s + (idx - 1) * D + r + Z3[z3_idx] *= wq[idx] + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Backward pass + loop_idx = tidx + s = D * SQ[1] * SB[2] + while loop_idx <= s + r, i1, j2 = @index_to_tuple(loop_idx, D, SQ[1], SB[2]) + val = 0.0 + @inbounds for i2 in 1:SQ[2] + z3_idx = (tidy - 1) * SQ[2] * SQ[1] * D + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r + val += ij_mats[i2,r,j2,2] * Z3[z3_idx] + end + z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r + Z2[z2_idx] = val + loop_idx += tidx_step + end + CUDA.sync_threads() + + loop_idx = tidx + s = D * SB[1] * SB[2] + while loop_idx <= s + r, j1, j2 = @index_to_tuple(loop_idx, D, SB[1], SB[2]) + val = 0.0 + @inbounds for i1 in 1:SQ[1] + z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r + val += ij_mats[i1,r,j1,1] * Z2[z2_idx] + end + z1_idx = (tidy - 1) * s + (j2 - 1) * SB[1] * D + (j1 - 1) * D + r + Z1[z1_idx] = val + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Assemble + loop_idx = tidx + s = SB[1] * SB[2] + while loop_idx <= s + i = dof_map[loop_idx] + id = ids[i] + val = 0.0 + @inbounds for r in 1:D + z1_idx = ((tidy - 1) * s + loop_idx-1) * D + r + val += Z1[z1_idx] + end + CUDA.@atomic y[abs(id)] += val * (id > 0) + loop_idx += tidx_step + end + CUDA.sync_threads() + + cell += gridDim().x * blockDim().y + end + + return +end + + + +function gpu_mul_v6!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq,ij_mats,ji_mats) where {D, SB, SQ} + dof_map = m.dof_map + 
#CUDA.Const(mats) + #CUDA.Const(dof_map) + tidy = threadIdx().y + tidx = threadIdx().x + tidx_step = blockDim().x + + s13 = blockDim().y*D*max(SB[1]*SB[2],SQ[1]*SQ[2]) + s2 = blockDim().y*D*SQ[1]*SB[2] + smats = D*D*SB[1]*SQ[1] + + Z = @cuDynamicSharedMem(Float64,s13+s2+2*smats) + Z1 = view(Z,1:s13) + Z2 = view(Z,s13+1:s13+s2) + Z3 = view(Z,1:s13) + + # Bring matrices to shared memory + f_mats = view(Z,s13+s2+1:s13+s2+smats) + b_mats = view(Z,s13+s2+smats+1:s13+s2+2*smats) + loop_idx = (tidy-1) * tidx_step + tidx + while loop_idx <= smats + f_mats[loop_idx] = ji_mats[loop_idx] + b_mats[loop_idx] = ij_mats[loop_idx] + loop_idx += tidx_step*blockDim().y + end + CUDA.sync_threads() + + cell = (blockIdx().x - 1) * blockDim().y + threadIdx().y + while cell <= nCells + # Scatter + ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + + loop_idx = tidx + s = D * SB[1] * SB[2] + while loop_idx <= s + r, i = @index_to_tuple(loop_idx, D, SB[1] * SB[2]) + I = dof_map[i]; j1 = I[1]; j2 = I[2] + id = ids[i] + z1_idx = (tidy - 1) * s + (j1 - 1) * SB[2] * D + (j2 - 1) * D + r + + Z1[z1_idx] = x[abs(id)] * (id > 0) + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Forward pass + loop_idx = tidx + s = D * SQ[1] * SB[2] + while loop_idx <= s + r, j2, i1 = @index_to_tuple(loop_idx, D, SB[2], SQ[1]) + + z2_idx = (tidy - 1) * s + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r # Not coalesced + Z2[z2_idx] = 0.0 + @inbounds for j1 in 1:SB[1] + z1_idx = (tidy - 1) * SB[1] * SB[2] * D + (j1 - 1) * SB[2] * D + (j2 - 1) * D + r # Coalesced + #fm_idx = (j1 - 1) * SQ[1] * D + (i1 - 1) * D + r + bm_idx = (i1 - 1) * SB[1] * D + (j1 - 1) * D + r + Z2[z2_idx] += f_mats[bm_idx] * Z1[z1_idx] + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + loop_idx = tidx + s = D * SQ[1] * SQ[2] + while loop_idx <= s + r, i1, i2 = @index_to_tuple(loop_idx, D, SQ[1], SQ[2]) + + z3_idx = (tidy - 1) * s + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r # Coalesced + Z3[z3_idx] = 0.0 + @inbounds for j2 in 1:SB[2] + z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (j2 - 1) * SQ[1] * D + (i1 - 1) * D + r # Coalesced + #fm_idx = SB[2]*SQ[2]*D + (j2 - 1) * SQ[2] * D + (i2 - 1) * D + r + bm_idx = SQ[2]*SB[2]*D + (i2 - 1) * SB[2] * D + (j2 - 1) * D + r + Z3[z3_idx] += f_mats[bm_idx] * Z2[z2_idx] + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Apply weights + loop_idx = tidx + s = D * SQ[1] * SQ[2] + while loop_idx <= s + r, idx = @index_to_tuple(loop_idx, D, SQ[1]*SQ[2]) + z3_idx = (tidy - 1) * s + loop_idx + Z3[z3_idx] *= wq[idx] + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Backward pass + loop_idx = tidx + s = D * SQ[1] * SB[2] + while loop_idx <= s + r, i1, j2 = @index_to_tuple(loop_idx, D, SQ[1], SB[2]) + z2_idx = (tidy - 1) * s + (i1 - 1) * SB[2] * D + (j2 - 1) * D + r # Not coalesced + Z2[z2_idx] = 0.0 + @inbounds for i2 in 1:SQ[2] + z3_idx = (tidy - 1) * SQ[2] * SQ[1] * D + (i2 - 1) * SQ[1] * D + (i1 - 1) * D + r # Coalesced + #bm_idx = SQ[2]*SB[2]*D + (i2 - 1) * SB[2] * D + (j2 - 1) * D + r + fm_idx = SB[2]*SQ[2]*D + (j2 - 1) * SQ[2] * D + (i2 - 1) * D + r + Z2[z2_idx] += b_mats[fm_idx] * Z3[z3_idx] + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + loop_idx = tidx + s = D * SB[1] * SB[2] + while loop_idx <= s + r, j2, j1 = @index_to_tuple(loop_idx, D, SB[2], SB[1]) + z1_idx = (tidy - 1) * s + loop_idx + Z1[z1_idx] = 0.0 + @inbounds for i1 in 1:SQ[1] + z2_idx = (tidy - 1) * SB[2] * SQ[1] * D + (i1 - 1) * SB[2] * D + (j2 - 1) * D + r # Coalesced + #bm_idx = (i1 - 1) * SB[1] * D + (j1 - 
1) * D + r + fm_idx = (j1 - 1) * SQ[1] * D + (i1 - 1) * D + r + Z1[z1_idx] += b_mats[fm_idx] * Z2[z2_idx] + end + loop_idx += tidx_step + end + CUDA.sync_threads() + + # Assemble + loop_idx = tidx + s = D * SB[1] * SB[2] + while loop_idx <= s + r, i = @index_to_tuple(loop_idx, D, SB[1] * SB[2]) + I = dof_map[i] + j1 = I[1] + j2 = I[2] + id = ids[i] + z1_idx = (tidy - 1) * s + (j1 - 1) * SB[2] * D + (j2 - 1) * D + r + CUDA.@atomic y[abs(id)] += Z1[z1_idx] * (id > 0) + loop_idx += tidx_step + end + CUDA.sync_threads() + + cell += gridDim().x * blockDim().y + end + + return +end + + +@generated function gpu_mul_v7!(m::SumFactorizationMap{D, SB, SQ}, nCells, y, x, cell_ids, wq,ij_mats,ji_mats,_dof_map,::Val{blockdims}) where {D, SB, SQ,blockdims} + s13 = blockdims[2]*D*max(SB[1]*SB[2],SQ[1]*SQ[2]) + s2 = blockdims[2]*D*SQ[1]*SB[2] + smats = D*D*SB[1]*SQ[1] + smaps = prod(SB) + + return quote + tidy = threadIdx().y + tidx = threadIdx().x + + Z = @cuStaticSharedMem(Float64,$(s13+s2+2*smats)) + Z1 = view(Z,1:$(s13)) + Z2 = view(Z,$(s13+1):$(s13+s2)) + Z3 = view(Z,1:$(s13)) + + # Bring matrices to shared memory + f_mats = view(Z,$(s13+s2+1):$(s13+s2+smats)) + b_mats = view(Z,$(s13+s2+smats+1):$(s13+s2+2*smats)) + loop_idx = (tidy-1) * $(blockdims[1]) + tidx + while loop_idx <= $(smats) + f_mats[loop_idx] = ij_mats[loop_idx] + b_mats[loop_idx] = ji_mats[loop_idx] + loop_idx += $(blockdims[1]*blockdims[2]) + end + + dof_map = @cuStaticSharedMem(Int32,$smaps) + loop_idx = (tidy-1) * $(blockdims[1]) + tidx + while loop_idx <= $(smaps) + dof_map[loop_idx] = _dof_map[loop_idx] + loop_idx += $(blockdims[1]*blockdims[2]) + end + + CUDA.sync_threads() + + cell = (blockIdx().x - 1) * $(blockdims[2]) + threadIdx().y + while cell <= nCells + # Scatter + ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + + loop_idx = tidx + while loop_idx <= $(SB[1] * SB[2]) + id = ids[dof_map[loop_idx]] + xi = x[abs(id)] * (id > 0) + @inbounds for r in 1:$D + z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + (loop_idx - 1) * $D + r + Z1[z1_idx] = xi + end + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + # Forward pass + loop_idx = tidx + while loop_idx <= $(D * SQ[1] * SB[2]) + r, j2, i1 = @index_to_tuple(loop_idx, D, SB[2], SQ[1]) + + val = 0.0 + @inbounds for j1 in 1:$(SB[1]) + z1_idx = (tidy - 1) * $(SB[1] * SB[2] * D) + (j1 - 1) * $(SB[2] * D) + (j2 - 1) * $D + r # Coalesced + fm_idx = (j1 - 1) * SQ[1] * D + (i1 - 1) * D + r + #bm_idx = (i1 - 1) * $(SB[1] * D) + (j1 - 1) * $D + r + val += f_mats[fm_idx] * Z1[z1_idx] + end + z2_idx = (tidy - 1) * $(D * SQ[1] * SB[2]) + (j2 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r # Not coalesced + Z2[z2_idx] = val + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + loop_idx = tidx + while loop_idx <= $(D * SQ[1] * SQ[2]) + r, i1, i2 = @index_to_tuple(loop_idx, $D, $(SQ[1]), $(SQ[2])) + + val = 0.0 + @inbounds for j2 in 1:$(SB[2]) + z2_idx = (tidy - 1) * $(SB[2] * SQ[1] * D) + (j2 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r # Coalesced + fm_idx = SB[2]*SQ[2]*D + (j2 - 1) * SQ[2] * D + (i2 - 1) * D + r + #bm_idx = $(SQ[2]*SB[2]*D) + (i2 - 1) * $(SB[2] * D) + (j2 - 1) * $D + r + val += f_mats[fm_idx] * Z2[z2_idx] + end + z3_idx = (tidy - 1) * $(D * SQ[1] * SQ[2]) + (i2 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r # Coalesced + Z3[z3_idx] = val + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + # Apply weights + loop_idx = tidx + while loop_idx <= $(D * SQ[1] * SQ[2]) + r, idx = @index_to_tuple(loop_idx, $D, $(SQ[1]*SQ[2])) + z3_idx = (tidy - 1) * $(D * SQ[1] * 
SQ[2]) + loop_idx + Z3[z3_idx] *= wq[idx] + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + # Backward pass + loop_idx = tidx + while loop_idx <= $(D * SQ[1] * SB[2]) + r, i1, j2 = @index_to_tuple(loop_idx, $D, $(SQ[1]), $(SB[2])) + + val = 0.0 + @inbounds for i2 in 1:$(SQ[2]) + z3_idx = (tidy - 1) * $(SQ[2] * SQ[1] * D) + (i2 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r # Coalesced + bm_idx = SQ[2]*SB[2]*D + (i2 - 1) * SB[2] * D + (j2 - 1) * D + r + #fm_idx = $(SB[2]*SQ[2]*D) + (j2 - 1) * $(SQ[2] * D) + (i2 - 1) * $D + r + val += b_mats[bm_idx] * Z3[z3_idx] + end + z2_idx = (tidy - 1) * $(D * SQ[1] * SB[2]) + (i1 - 1) * $(SB[2] * D) + (j2 - 1) * $D + r # Not coalesced + Z2[z2_idx] = val + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + loop_idx = tidx + while loop_idx <= $(D * SB[1] * SB[2]) + r, j2, j1 = @index_to_tuple(loop_idx, $D, $(SB[2]), $(SB[1])) + + val = 0.0 + @inbounds for i1 in 1:$(SQ[1]) + z2_idx = (tidy - 1) * $(SB[2] * SQ[1] * D) + (i1 - 1) * $(SB[2] * D) + (j2 - 1) * $D + r # Coalesced + bm_idx = (i1 - 1) * SB[1] * D + (j1 - 1) * D + r + #fm_idx = (j1 - 1) * $(SQ[1] * D) + (i1 - 1) * $D + r + val += b_mats[bm_idx] * Z2[z2_idx] + end + z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + loop_idx + Z1[z1_idx] = val + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + # Assemble + loop_idx = tidx + while loop_idx <= $(SB[1] * SB[2]) + id = ids[dof_map[loop_idx]] + yi = 0.0 + @inbounds for r in 1:D + z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + (loop_idx - 1) * $D + r + yi += Z1[z1_idx] + end + CUDA.@atomic y[abs(id)] += yi * (id > 0) + loop_idx += blockdims[1] + end + CUDA.sync_threads() + + cell += gridDim().x * blockDim().y + end + + return + end end \ No newline at end of file diff --git a/src/NCIHackaton2023.jl b/src/NCIHackaton2023.jl index 12e222c..f2cf126 100644 --- a/src/NCIHackaton2023.jl +++ b/src/NCIHackaton2023.jl @@ -31,6 +31,9 @@ export gpu_mul_v1! export gpu_mul_v2! export gpu_mul_v3! export gpu_mul_v4! +export gpu_mul_v5! +export gpu_mul_v6! +export gpu_mul_v7! export count_manual_flops_poisson_matrix_free diff --git a/src/SumFactorizationMaps.jl b/src/SumFactorizationMaps.jl index 84f4bb9..5a3756e 100644 --- a/src/SumFactorizationMaps.jl +++ b/src/SumFactorizationMaps.jl @@ -28,7 +28,8 @@ function to_gpu(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} mats[:, :, r, k] = m.mats[2][r, k] end gpu_mats = CuArray(mats) - return SumFactorizationMap{D, SB, SQ}(gpu_mats, CuArray(m.dof_map)) + D_32 = Int32(D); SB_32 = Tuple(map(Int32,SB)); SQ_32 = Tuple(map(Int32,SQ)) + return SumFactorizationMap{D_32, SB_32, SQ_32}(gpu_mats, CuArray(m.dof_map)) end function _get_terms(poly::Polytope, orders) diff --git a/test/GPU_v0.jl b/test/GPU_v0.jl index bfad0ae..0f3c4b8 100644 --- a/test/GPU_v0.jl +++ b/test/GPU_v0.jl @@ -79,14 +79,13 @@ mul!(y_ref, A_lazy, x_ref) cpu_y = Array(y) cpu_y ≈ y_ref +# Benchmark +niter = 100 +time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + # Profile CUDA.@profile begin for iter in 1:10 CUDA.@sync kernel(kernel_args...; config...) 
end -end - -# Benchmark -niter = 100 -time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) - +end \ No newline at end of file diff --git a/test/GPU_v1.jl b/test/GPU_v1.jl index a2cfdf1..dec75a9 100644 --- a/test/GPU_v1.jl +++ b/test/GPU_v1.jl @@ -16,8 +16,8 @@ using NCIHackaton2023 # Parameters D = 2 # Problem dimension -fe_orders = Tuple(fill(1, D)) # FE element orders -quad_orders = Tuple(fill(4, D)) # Quadrature orders +fe_orders = Tuple(fill(4, D)) # FE element orders +quad_orders = Tuple(fill(6, D)) # Quadrature orders # Setup n = 512 @@ -47,8 +47,8 @@ A_lazy = LazyMatrix(m, U, V, dΩ) ############################################################################################ # GPU implementation nCells = num_cells(Ω) -nt = 4 * 192 -nb = 80 +nt = 16*56 +nb = 320 gpu_m = to_gpu(m) gpu_cell_dof_ids = to_gpu(get_cell_dof_ids(U)); @@ -71,7 +71,7 @@ kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq, gpu_Zk...) kernel = @cuda name = "gpu_mul_v1" launch = false gpu_mul_v1!(kernel_args...); config = launch_configuration(kernel.fun) -config = (threads = (32,20), blocks = 80) +config = (threads = (16,56), blocks = 320) kernel(kernel_args...; config...) y_ref = zeros(length(b)) @@ -80,13 +80,12 @@ mul!(y_ref, A_lazy, x_ref) cpu_y = Array(y) cpu_y ≈ y_ref -# Profile +# Benchmark +niter = 100 +time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + CUDA.@profile begin for iter in 1:10 CUDA.@sync kernel(kernel_args...; config...) end end - -# Benchmark -niter = 100 -time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) diff --git a/test/GPU_v2.jl b/test/GPU_v2.jl index 09cb8f9..53ece57 100644 --- a/test/GPU_v2.jl +++ b/test/GPU_v2.jl @@ -56,6 +56,17 @@ gpu_wq = CuArray(cell_wq.value) gpu_jq = CuArray(cell_jq.value) gpu_djq = CuArray(cell_djq.value) +function get_mats(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + ji_mats = zeros((D,SB[1],SQ[1],D)) + ij_mats = zeros((D,SQ[1],SB[1],D)) + for r in 1:D, k in 1:D, i in 1:SQ[1], j in 1:SB[1] + ji_mats[r,j,i,k] = m.mats[2][k,r][i,j] + ij_mats[r,i,j,k] = m.mats[2][k,r][i,j] + end + return CuArray(ij_mats), CuArray(ji_mats) +end +ij_mats,ji_mats = get_mats(m) + # Caches D, SB, SQ = get_dimensional_parameters(m) @@ -63,13 +74,13 @@ D, SB, SQ = get_dimensional_parameters(m) x_ref = ones(size(b)) x = CuArray(x_ref) y = CuArray(zeros(size(b))) -kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq) +kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq,ij_mats,ji_mats) -kernel = @cuda name = "gpu_mul_v2" launch = false gpu_mul_v2!(kernel_args...); +kernel = @cuda name = "gpu_mul_v2" launch = false gpu_mul_v6!(kernel_args...); config = launch_configuration(kernel.fun) -mem = 16*D*(prod(SB) + SQ[1]*SB[2] + prod(SQ))*sizeof(Float64) -config = (threads=(32,16),blocks=160,shmem=mem) +mem = 64*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) + 2*D*D*SB[1]*SQ[1]*sizeof(Float64) +config = (threads=(8,64),blocks=320,shmem=mem) kernel(kernel_args...;config...) y_ref = zeros(length(b)) @@ -80,3 +91,9 @@ cpu_y ≈ y_ref niter = 100 time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + +CUDA.@profile begin + for iter in 1:10 + CUDA.@sync kernel(kernel_args...; config...) + end +end diff --git a/test/GPU_v3.jl b/test/GPU_v3.jl index ad2c2b0..cfd9df4 100644 --- a/test/GPU_v3.jl +++ b/test/GPU_v3.jl @@ -1,6 +1,6 @@ """ - SUMFAC-GPU v2 - Basic kernel. Starting to take advantage of Shared & Constant memory. 
+ SUMFAC-GPU v3 + Tackling the issue of memory access coalescence. """ using Test @@ -20,7 +20,7 @@ fe_orders = Tuple(fill(4, D)) # FE element orders quad_orders = Tuple(fill(6, D)) # Quadrature orders # Setup -n = 1024 +n = 512 domain = repeat([0, 1], D) partition = fill(n, D) model = CartesianDiscreteModel(domain, partition) @@ -53,6 +53,16 @@ function get_mats(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} return CuArray(ij_mats), CuArray(ji_mats) end +function get_dof_map(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + dof_map = Vector{Int32}(undef,prod(SB)) + for i in 1:prod(SB) + I = m.dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) + idx = (j2 - 1) * SB[1] + j1 + dof_map[idx] = i + end + return CuArray(dof_map) +end + ############################################################################################ ############################################################################################ # GPU implementation @@ -61,6 +71,7 @@ nCells = num_cells(Ω) gpu_m = to_gpu(m) gpu_cell_dof_ids = to_gpu(get_cell_dof_ids(U)); ij_mats, ji_mats = get_mats(m) +dof_map = get_dof_map(m) cell_wq, cell_jq, cell_djq = A_lazy.cell_quantities gpu_wq = CuArray(cell_wq.value) @@ -68,24 +79,24 @@ gpu_jq = CuArray(cell_jq.value) gpu_djq = CuArray(cell_djq.value) # Caches -D, SB, SQ = get_dimensional_parameters(m) +D, SB, SQ = get_dimensional_parameters(gpu_m) # Comparison CPU vs GPU x_ref = ones(size(b)) x = CuArray(x_ref) y = CuArray(zeros(size(b))) -kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq, ij_mats, ji_mats) +kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq, ij_mats, ji_mats, dof_map) kernel = @cuda name = "gpu_mul_v3" launch = false gpu_mul_v3!(kernel_args...); config = launch_configuration(kernel.fun) -mem = 16*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) -config = (threads=(16,16),blocks=320,shmem=mem) +mem = 32*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) +config = (threads=(8,32),blocks=320,shmem=mem) kernel(kernel_args...;config...) y_ref = zeros(length(b)) @elapsed mul!(y_ref, A_lazy, x_ref) -@elapsed mul!(y_ref, A, x_ref) +#@elapsed mul!(y_ref, A, x_ref) cpu_y = Array(y) cpu_y ≈ y_ref diff --git a/test/GPU_v5.jl b/test/GPU_v5.jl new file mode 100644 index 0000000..97abe53 --- /dev/null +++ b/test/GPU_v5.jl @@ -0,0 +1,111 @@ +""" + SUMFAC-GPU v3/v5 + Tackling the issue of memory access coalescence. 
+""" + +using Test +using LinearAlgebra + +using Gridap +using Gridap.TensorValues +using Gridap.Arrays + +using CUDA +using Adapt +using NCIHackaton2023 + +# Parameters +D = 2 # Problem dimension +fe_orders = Tuple(fill(4, D)) # FE element orders +quad_orders = Tuple(fill(6, D)) # Quadrature orders + +# Setup +n = 512 +domain = repeat([0, 1], D) +partition = fill(n, D) +model = CartesianDiscreteModel(domain, partition) + +Ω = Triangulation(model) +dΩ = Measure(Ω, quad_orders[1]) + +g(x) = 0.0 +reffe = ReferenceFE(lagrangian, Float64, fe_orders) +V = FESpace(Ω, reffe; dirichlet_tags = ["boundary"]) +U = TrialFESpace(V, g) + +# Assembled matrix +f(x) = 1.0 +op = AffineFEOperator((u, v) -> ∫(∇(u) ⋅ ∇(v))dΩ, v -> ∫(v * f)dΩ, U, V) +A = get_matrix(op) +b = get_vector(op) + +# Sum-Factorization based matrix +m = SumFactorizationMap(D, fe_orders, quad_orders) +A_lazy = LazyMatrix(m, U, V, dΩ) + +function get_mats(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + ji_mats = zeros((SB[1],D,SQ[1],D)) + ij_mats = zeros((SQ[1],D,SB[1],D)) + for r in 1:D, k in 1:D, i in 1:SQ[1], j in 1:SB[1] + ji_mats[j,r,i,k] = m.mats[2][k,r][i,j] + ij_mats[i,r,j,k] = m.mats[2][k,r][i,j] + end + return CuArray(ij_mats), CuArray(ji_mats) +end + +function get_dof_map(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + dof_map = Vector{Int32}(undef,prod(SB)) + for i in 1:prod(SB) + I = m.dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) + idx = (j2 - 1) * SB[1] + j1 + dof_map[idx] = i + end + return CuArray(dof_map) +end + +############################################################################################ +############################################################################################ +# GPU implementation +nCells = num_cells(Ω) + +gpu_m = to_gpu(m) +gpu_cell_dof_ids = to_gpu(get_cell_dof_ids(U)); +ij_mats, ji_mats = get_mats(m) +dof_map = get_dof_map(m) + +cell_wq, cell_jq, cell_djq = A_lazy.cell_quantities +gpu_wq = CuArray(cell_wq.value) +gpu_jq = CuArray(cell_jq.value) +gpu_djq = CuArray(cell_djq.value) + +# Caches +D, SB, SQ = get_dimensional_parameters(gpu_m) + +# Comparison CPU vs GPU +x_ref = ones(size(b)) +x = CuArray(x_ref) +y = CuArray(zeros(size(b))) +kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq, ij_mats, ji_mats, dof_map) + +kernel = @cuda name = "gpu_mul_v5" launch = false gpu_mul_v5!(kernel_args...); +config = launch_configuration(kernel.fun) + +mem = 32*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) +config = (threads=(8,32),blocks=320,shmem=mem) +kernel(kernel_args...;config...) + +y_ref = zeros(length(b)) +@elapsed mul!(y_ref, A_lazy, x_ref) +#@elapsed mul!(y_ref, A, x_ref) + +cpu_y = Array(y) +cpu_y ≈ y_ref + +niter = 100 +time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + +CUDA.@profile begin + for iter in 1:10 + CUDA.@sync kernel(kernel_args...; config...) + end +end diff --git a/test/GPU_v7.jl b/test/GPU_v7.jl new file mode 100644 index 0000000..4cc8983 --- /dev/null +++ b/test/GPU_v7.jl @@ -0,0 +1,111 @@ +""" + SUMFAC-GPU v2 + Basic kernel. Starting to take advantage of Shared & Constant memory. 
+""" + +using Test +using LinearAlgebra + +using Gridap +using Gridap.TensorValues +using Gridap.Arrays + +using CUDA +using Adapt +using NCIHackaton2023 + +# Parameters +D = 2 # Problem dimension +fe_orders = Tuple(fill(4, D)) # FE element orders +quad_orders = Tuple(fill(6, D)) # Quadrature orders + +# Setup +n = 512 +domain = repeat([0, 1], D) +partition = fill(n, D) +model = CartesianDiscreteModel(domain, partition) + +Ω = Triangulation(model) +dΩ = Measure(Ω, quad_orders[1]) + +g(x) = 0.0 +reffe = ReferenceFE(lagrangian, Float64, fe_orders) +V = FESpace(Ω, reffe; dirichlet_tags = ["boundary"]) +U = TrialFESpace(V, g) + +# Assembled matrix +f(x) = 1.0 +#op = AffineFEOperator((u, v) -> ∫(∇(u) ⋅ ∇(v))dΩ, v -> ∫(v * f)dΩ, U, V) +#A = get_matrix(op) +#b = get_vector(op) + +# Sum-Factorization based matrix +m = SumFactorizationMap(D, fe_orders, quad_orders) +A_lazy = LazyMatrix(m, U, V, dΩ) + +############################################################################################ +############################################################################################ +# GPU implementation +nCells = num_cells(Ω) +nDofs = num_free_dofs(U) + +gpu_m = to_gpu(m) +gpu_cell_dof_ids = to_gpu(get_cell_dof_ids(U)); + +cell_wq, cell_jq, cell_djq = A_lazy.cell_quantities +gpu_wq = CuArray(cell_wq.value) +gpu_jq = CuArray(cell_jq.value) +gpu_djq = CuArray(cell_djq.value) + +function get_mats(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + ji_mats = zeros((D,SB[1],SQ[1],D)) + ij_mats = zeros((D,SQ[1],SB[1],D)) + for r in 1:D, k in 1:D, i in 1:SQ[1], j in 1:SB[1] + ji_mats[r,j,i,k] = m.mats[2][k,r][i,j] + ij_mats[r,i,j,k] = m.mats[2][k,r][i,j] + end + return CuArray(ij_mats), CuArray(ji_mats) +end +ij_mats,ji_mats = get_mats(m) + +function get_dof_map(m::SumFactorizationMap{D, SB, SQ}) where {D, SB, SQ} + dof_map = Vector{Int32}(undef,prod(SB)) + for i in 1:prod(SB) + I = m.dof_map[i]; j1 = Int32(I[1]); j2 = Int32(I[2]) + idx = (j1 - 1) * SB[1] + j2 + dof_map[idx] = i + end + return CuArray(dof_map) +end +dof_map = get_dof_map(m) + +# Caches +D, SB, SQ = get_dimensional_parameters(m) + +# Comparison CPU vs GPU +x_ref = ones(nDofs) +x = CuArray(x_ref) +y = CuArray(zeros(nDofs)) +kernel_args = (gpu_m, nCells, y, x, gpu_cell_dof_ids, gpu_wq,ij_mats,ji_mats,dof_map,Val((8,64))) + +kernel = @cuda name = "gpu_mul_v7" launch = false gpu_mul_v7!(kernel_args...); +config = launch_configuration(kernel.fun) + +mem = 64*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) + 2*D*D*SB[1]*SQ[1]*sizeof(Float64) +config = (threads=(8,64),blocks=1280)#,shmem=mem) +kernel(kernel_args...;config...) + +y_ref = zeros(nDofs) +@elapsed mul!(y_ref, A_lazy, x_ref) + +cpu_y = Array(y) +cpu_y ≈ y_ref + +niter = 200 +time = NCIHackaton2023.benchmark_kernel(kernel, config, kernel_args, niter) + +CUDA.@profile begin + for iter in 1:10 + CUDA.@sync kernel(kernel_args...; config...) 
+ end +end \ No newline at end of file diff --git a/test/cdot.jl b/test/benchmark_cdot.jl similarity index 100% rename from test/cdot.jl rename to test/benchmark_cdot.jl diff --git a/test/my_test.jl b/test/my_test.jl index 4fce094..622206d 100644 --- a/test/my_test.jl +++ b/test/my_test.jl @@ -17,3 +17,12 @@ end main() + +############################################################################################ + + + +dof_map + + + From 308e527565a9c1b9a9f61b410c0ec5081cb0352b Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Mon, 19 Jun 2023 11:29:16 +1000 Subject: [PATCH 3/4] minor changes --- profile_cuda_ncu.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/profile_cuda_ncu.sh b/profile_cuda_ncu.sh index 470f358..ae80d08 100755 --- a/profile_cuda_ncu.sh +++ b/profile_cuda_ncu.sh @@ -14,7 +14,10 @@ module load nvidia-hpc-sdk/22.11 export LD_LIBRARY_PATH=~/bin/julia-1.8.5/lib/julia/:$LD_LIBRARY_PATH ncu --set full -k regex:gpu_mul --target-processes all -o result_v1 julia --project=. test/GPU_v1.jl +ncu --set full -k regex:gpu_mul --target-processes all -o result_v2 julia --project=. test/GPU_v2.jl ncu --set full -k regex:gpu_mul --target-processes all -o result_v3 julia --project=. test/GPU_v3.jl +ncu --set full -k regex:gpu_mul --target-processes all -o result_v5 julia --project=. test/GPU_v5.jl +ncu --set full -k regex:gpu_mul --target-processes all -o result_v7 julia --project=. test/GPU_v7.jl #ncu launch --trace cuda julia --project=. -e'include("test/GPU_v0.jl")' #ncu launch --trace cuda julia --project=. -e'include("test/GPU_v1.jl")' From 4d02a70ffc7017f51ebc5b6e83cd96b2e452f324 Mon Sep 17 00:00:00 2001 From: JordiManyer Date: Tue, 27 Jun 2023 18:15:21 +1000 Subject: [PATCH 4/4] Save progress --- profile_cuda_ncu.sh | 4 ++-- src/GPUKernels.jl | 34 +++++++++++++++++++++------------- test/GPU_v7.jl | 2 +- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/profile_cuda_ncu.sh b/profile_cuda_ncu.sh index ae80d08..51910f0 100755 --- a/profile_cuda_ncu.sh +++ b/profile_cuda_ncu.sh @@ -1,12 +1,12 @@ #!/bin/bash -#PBS -P vp91 +#PBS -P kr97 #PBS -q gpuvolta #PBS -l walltime=01:00:00 #PBS -l ncpus=12 #PBS -l ngpus=1 #PBS -l mem=90GB #PBS -l jobfs=10GB -#PBS -l storage=scratch/vp91 +#PBS -l storage=scratch/kr97 module load cuda module load nvidia-hpc-sdk/22.11 diff --git a/src/GPUKernels.jl b/src/GPUKernels.jl index 8b0b45c..81ef25e 100644 --- a/src/GPUKernels.jl +++ b/src/GPUKernels.jl @@ -925,21 +925,13 @@ end tidy = threadIdx().y tidx = threadIdx().x - Z = @cuStaticSharedMem(Float64,$(s13+s2+2*smats)) + Z = @cuStaticSharedMem(Float64,$(s13+s2)) Z1 = view(Z,1:$(s13)) Z2 = view(Z,$(s13+1):$(s13+s2)) Z3 = view(Z,1:$(s13)) # Bring matrices to shared memory - f_mats = view(Z,$(s13+s2+1):$(s13+s2+smats)) - b_mats = view(Z,$(s13+s2+smats+1):$(s13+s2+2*smats)) - loop_idx = (tidy-1) * $(blockdims[1]) + tidx - while loop_idx <= $(smats) - f_mats[loop_idx] = ij_mats[loop_idx] - b_mats[loop_idx] = ji_mats[loop_idx] - loop_idx += $(blockdims[1]*blockdims[2]) - end - + ids = @cuStaticSharedMem(Int64,$smaps) dof_map = @cuStaticSharedMem(Int32,$smaps) loop_idx = (tidy-1) * $(blockdims[1]) + tidx while loop_idx <= $(smaps) @@ -949,14 +941,30 @@ end CUDA.sync_threads() + f_mats = @cuStaticSharedMem(Float64,$smats) + #b_mats = @cuStaticSharedMem(Float64,$smats) + loop_idx = (tidy-1) * $(blockdims[1]) + tidx + while loop_idx <= $(smats) + f_mats[loop_idx] = ij_mats[loop_idx] + #b_mats[loop_idx] = ji_mats[loop_idx] + loop_idx += $(blockdims[1]*blockdims[2]) + end + b_mats = f_mats + cell = 
(blockIdx().x - 1) * $(blockdims[2]) + threadIdx().y while cell <= nCells # Scatter - ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + _ids = view(cell_ids.data, cell_ids.ptrs[cell]:cell_ids.ptrs[cell+1]-1) + loop_idx = tidx + while loop_idx <= $(SB[1] * SB[2]) + ids[loop_idx] = _ids[dof_map[loop_idx]] + loop_idx += blockdims[1] + end + CUDA.sync_threads() loop_idx = tidx while loop_idx <= $(SB[1] * SB[2]) - id = ids[dof_map[loop_idx]] + id = ids[loop_idx] #ids[dof_map[loop_idx]] xi = x[abs(id)] * (id > 0) @inbounds for r in 1:$D z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + (loop_idx - 1) * $D + r @@ -1049,7 +1057,7 @@ end # Assemble loop_idx = tidx while loop_idx <= $(SB[1] * SB[2]) - id = ids[dof_map[loop_idx]] + id = ids[loop_idx] #ids[dof_map[loop_idx]] yi = 0.0 @inbounds for r in 1:D z1_idx = (tidy - 1) * $(D * SB[1] * SB[2]) + (loop_idx - 1) * $D + r diff --git a/test/GPU_v7.jl b/test/GPU_v7.jl index 4cc8983..f91bcdc 100644 --- a/test/GPU_v7.jl +++ b/test/GPU_v7.jl @@ -92,7 +92,7 @@ kernel = @cuda name = "gpu_mul_v7" launch = false gpu_mul_v7!(kernel_args...); config = launch_configuration(kernel.fun) mem = 64*D*(max(prod(SB),prod(SQ)) + SQ[1]*SB[2])*sizeof(Float64) + 2*D*D*SB[1]*SQ[1]*sizeof(Float64) -config = (threads=(8,64),blocks=1280)#,shmem=mem) +config = (threads=(5,64),blocks=1280)#,shmem=mem) kernel(kernel_args...;config...) y_ref = zeros(nDofs)
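
Note on the algorithm behind the kernels above: every gpu_mul_v* variant computes the same cell-local sum-factorized product (scatter the local dof values, contract each tensor direction into quadrature space, apply the quadrature weights, contract back, and atomically assemble into the global vector); the versions differ only in where the buffers and 1D matrices live (dynamic/static shared memory, constant memory), how the (r, i1, i2) indices are mapped to threads, and how the matrices are laid out for coalescing. The serial Julia sketch below is an illustrative reconstruction of that per-cell operation for the 2D case, not code from the patches: the helper name sumfac_cell_mul! is hypothetical, mats[i, j, direction, r] follows the layout read by gpu_mul_v2!, dof_map[i] is assumed to return the (j1, j2) tensor indices of local dof i (as m.dof_map does in the kernels), and non-positive ids are assumed to mark constrained dofs, matching the x[abs(id)] * (id > 0) masking used throughout.

# Hypothetical serial reference (not part of the patches) of the cell-local
# product that the gpu_mul_v* CUDA kernels parallelise, 2D case.
#   D  : number of gradient components,  SB/SQ : 1D basis / quadrature sizes
#   ids: global dof ids of the cell,  dof_map[i] -> (j1, j2) of local dof i
function sumfac_cell_mul!(y, x, ids, dof_map, mats, wq, D, SB, SQ)
    Z1 = zeros(D, SB[1], SB[2])   # dof-space buffer
    Z2 = zeros(D, SQ[1], SB[2])   # half-transformed buffer
    Z3 = zeros(D, SQ[1], SQ[2])   # quadrature-space buffer

    # Scatter: gather local dof values, replicated over the D components
    for (i, J) in enumerate(dof_map)
        id  = ids[i]
        val = id > 0 ? x[id] : 0.0          # id <= 0 marks a constrained dof
        Z1[:, J[1], J[2]] .= val
    end

    # Forward pass: contract dof indices (j1, j2) into quadrature indices (i1, i2)
    for r in 1:D, j2 in 1:SB[2], i1 in 1:SQ[1]
        Z2[r, i1, j2] = sum(mats[i1, j1, 1, r] * Z1[r, j1, j2] for j1 in 1:SB[1])
    end
    for r in 1:D, i2 in 1:SQ[2], i1 in 1:SQ[1]
        Z3[r, i1, i2] = sum(mats[i2, j2, 2, r] * Z2[r, i1, j2] for j2 in 1:SB[2])
    end

    # Apply quadrature weights, indexed as (i2-1)*SQ[1] + i1 as in the kernels
    for r in 1:D, i2 in 1:SQ[2], i1 in 1:SQ[1]
        Z3[r, i1, i2] *= wq[(i2 - 1) * SQ[1] + i1]
    end

    # Backward pass: contract quadrature indices back into dof indices
    for r in 1:D, j2 in 1:SB[2], i1 in 1:SQ[1]
        Z2[r, i1, j2] = sum(mats[i2, j2, 2, r] * Z3[r, i1, i2] for i2 in 1:SQ[2])
    end
    for r in 1:D, j2 in 1:SB[2], j1 in 1:SB[1]
        Z1[r, j1, j2] = sum(mats[i1, j1, 1, r] * Z2[r, i1, j2] for i1 in 1:SQ[1])
    end

    # Assemble: accumulate into the global vector, summing the D components
    for (i, J) in enumerate(dof_map)
        id = ids[i]
        id > 0 && (y[id] += sum(Z1[:, J[1], J[2]]))
    end
    return y
end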