From 71496214b06acd86f275a29abae40fc1b5c706da Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Tue, 3 Sep 2024 18:23:22 +0200
Subject: [PATCH 1/7] Add options to force a __syncthreads within a map and to
 prefer the gpu_block_size property if it is set while the auto-detected size
 differs

---
 dace/codegen/targets/cuda.py | 22 ++++++++++++++--------
 dace/sdfg/nodes.py           |  2 ++
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index f080f2cc62..0a4f0a3a37 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1928,14 +1928,17 @@ def get_kernel_dimensions(self, dfg_scope):
 
         # Error when both gpu_block_size and thread-block maps were defined and conflict
         if kernelmap_entry.map.gpu_block_size is not None:
-            raise ValueError('Both the `gpu_block_size` property and internal thread-block '
-                             'maps were defined with conflicting sizes for kernel '
-                             f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). '
-                             'Use `gpu_block_size` only if you do not need access to individual '
-                             'thread-block threads, or explicit block-level synchronization (e.g., '
-                             '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or '
-                             '`GPU_ThreadBlock_Dynamic` schedules. For more information, see '
-                             'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html')
+            block_size = kernelmap_entry.map.gpu_block_size
+
+            warnings.warn('Both the `gpu_block_size` property and internal thread-block '
+                          'maps were defined with conflicting sizes for kernel '
+                          f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). '
+                          'The block size specified in `gpu_block_size` will be preferred. '
+                          'Use `gpu_block_size` only if you do not need access to individual '
+                          'thread-block threads, or explicit block-level synchronization (e.g., '
+                          '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or '
+                          '`GPU_ThreadBlock_Dynamic` schedules. For more information, see '
+                          'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html')
 
         warnings.warn('Multiple thread-block maps with different sizes detected for '
                       f'kernel "{kernelmap_entry.map.label}": {detected_block_sizes}. '
@@ -2618,6 +2621,9 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
 
     def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
                           node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads:
+            callsite_stream.write('__syncthreads();', cfg, state_id)
+
         if node.map.schedule == dtypes.ScheduleType.GPU_Device:
             # Remove grid invocation conditions
             for i in range(len(node.map.params)):
diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py
index 25030b595d..7661ef2bdd 100644
--- a/dace/sdfg/nodes.py
+++ b/dace/sdfg/nodes.py
@@ -925,6 +925,8 @@ class Map(object):
                               optional=True,
                               optional_condition=lambda m: m.schedule in dtypes.GPU_SCHEDULES)
 
+    gpu_force_syncthreads = Property(dtype=bool, desc="Force a syncthreads for the map", default=False)
+
     def __init__(self,
                  label,
                  params,

From 8efe6b5e255210f365e124210442064633012f8d Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Tue, 3 Sep 2024 18:26:16 +0200
Subject: [PATCH 2/7] Add skew option to the map tiling transformation

---
 dace/transformation/dataflow/tiling.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dace/transformation/dataflow/tiling.py b/dace/transformation/dataflow/tiling.py
index bfa899e71a..8a6d75f4db 100644
--- a/dace/transformation/dataflow/tiling.py
+++ b/dace/transformation/dataflow/tiling.py
@@ -33,6 +33,8 @@ class MapTiling(transformation.SingleStateTransformation):
     divides_evenly = Property(dtype=bool, default=False, desc="Tile size divides dimension length evenly")
     tile_trivial = Property(dtype=bool, default=False, desc="Tiles even if tile_size is 1")
 
+    skew = Property(dtype=bool, default=False, desc="If True, offsets the inner tile back such that it starts at zero")
+
     @staticmethod
     def annotates_memlets():
         return True
@@ -92,6 +94,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG):
                 stripmine.tile_stride = str(tile_stride)
                 stripmine.divides_evenly = True
                 stripmine.tile_offset = str(offset)
+                stripmine.skew = self.skew
                 stripmine.apply(graph, sdfg)
                 removed_maps += 1
             else:
@@ -101,6 +104,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG):
                 stripmine.tile_stride = str(tile_stride)
                 stripmine.divides_evenly = self.divides_evenly
                 stripmine.tile_offset = str(offset)
+                stripmine.skew = self.skew
                 stripmine.apply(graph, sdfg)
 
             # apply to the new map the schedule of the original one

From e8c134a4f5303780cfc51edacb40ab124bcf150f Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Tue, 3 Sep 2024 18:41:55 +0200
Subject: [PATCH 3/7] Improve grammar

---
 dace/sdfg/nodes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py
index 7661ef2bdd..f741de943c 100644
--- a/dace/sdfg/nodes.py
+++ b/dace/sdfg/nodes.py
@@ -925,7 +925,7 @@ class Map(object):
                               optional=True,
                               optional_condition=lambda m: m.schedule in dtypes.GPU_SCHEDULES)
 
-    gpu_force_syncthreads = Property(dtype=bool, desc="Force a syncthreads for the map", default=False)
+    gpu_force_syncthreads = Property(dtype=bool, desc="Force a call to __syncthreads at the end of the map", default=False)
 
     def __init__(self,
                  label,

From 675e1dc4e1508b0e605c9f511b8f31e583af2eed Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Thu, 5 Sep 2024 12:25:16 +0200
Subject: [PATCH 4/7] Incorporate step size into the thread-block size
 computation

---
 dace/codegen/targets/cuda.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 0a4f0a3a37..c6234cd383 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1898,11 +1898,14 @@ def get_kernel_dimensions(self, dfg_scope):
         detected_block_sizes = [block_size] if block_size is not None else []
         for tbmap, sym_map in tb_maps_sym_map:
             tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]]
+            tbstepsize = [t[-1] for t in tbmap.range[::-1]]
 
             # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32)
             # The partial trailing thread-block is emitted as an if-condition
             # that returns on some of the participating threads
-            tbsize = [symbolic.overapproximate(s) for s in tbsize]
+            # If the kernel is tiled and the thread-block map has a step size, the over-approximated
+            # thread-block size needs to be divided by the step, as each thread computes more than one element
+            tbsize = [symbolic.overapproximate(symbolic.overapproximate(s)/step) for s, step in zip(tbsize, tbstepsize)]
 
             # Linearize (flatten) rest of dimensions to third
             if len(tbsize) > 3:
@@ -1928,17 +1931,14 @@ def get_kernel_dimensions(self, dfg_scope):
 
         # Error when both gpu_block_size and thread-block maps were defined and conflict
         if kernelmap_entry.map.gpu_block_size is not None:
-            block_size = kernelmap_entry.map.gpu_block_size
-
-            warnings.warn('Both the `gpu_block_size` property and internal thread-block '
-                          'maps were defined with conflicting sizes for kernel '
-                          f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). '
-                          'The block size specified in `gpu_block_size` will be preferred. '
-                          'Use `gpu_block_size` only if you do not need access to individual '
-                          'thread-block threads, or explicit block-level synchronization (e.g., '
-                          '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or '
-                          '`GPU_ThreadBlock_Dynamic` schedules. For more information, see '
-                          'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html')
+            raise ValueError('Both the `gpu_block_size` property and internal thread-block '
+                             'maps were defined with conflicting sizes for kernel '
+                             f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). '
+                             'Use `gpu_block_size` only if you do not need access to individual '
+                             'thread-block threads, or explicit block-level synchronization (e.g., '
+                             '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or '
+                             '`GPU_ThreadBlock_Dynamic` schedules. For more information, see '
+                             'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html')
 
         warnings.warn('Multiple thread-block maps with different sizes detected for '
                       f'kernel "{kernelmap_entry.map.label}": {detected_block_sizes}. '
@@ -2621,9 +2621,6 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
 
     def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
                           node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
-        if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads:
-            callsite_stream.write('__syncthreads();', cfg, state_id)
-
         if node.map.schedule == dtypes.ScheduleType.GPU_Device:
             # Remove grid invocation conditions
             for i in range(len(node.map.params)):

From 5e21392e1cd0957f47fe3e524fdc5dab1c05fe9d Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Thu, 5 Sep 2024 14:33:28 +0200
Subject: [PATCH 5/7] The step is not really necessary for the block size
 detection if the loop starts from 0

---
 dace/codegen/targets/cuda.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index c6234cd383..9aec2418c4 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1898,14 +1898,11 @@ def get_kernel_dimensions(self, dfg_scope):
         detected_block_sizes = [block_size] if block_size is not None else []
         for tbmap, sym_map in tb_maps_sym_map:
             tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]]
-            tbstepsize = [t[-1] for t in tbmap.range[::-1]]
 
             # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32)
             # The partial trailing thread-block is emitted as an if-condition
             # that returns on some of the participating threads
-            # If the kernel is tiled and the thread-block map has a step size, the over-approximated
-            # thread-block size needs to be divided by the step, as each thread computes more than one element
-            tbsize = [symbolic.overapproximate(symbolic.overapproximate(s)/step) for s, step in zip(tbsize, tbstepsize)]
+            tbsize = [symbolic.overapproximate(s) for s, step in tbsize]
 
             # Linearize (flatten) rest of dimensions to third
             if len(tbsize) > 3:

From cd615940882ee6a5676d209489ac5aa02288e3a9 Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Thu, 5 Sep 2024 14:36:19 +0200
Subject: [PATCH 6/7] Fix typo

---
 dace/codegen/targets/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 9aec2418c4..f080f2cc62 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1902,7 +1902,7 @@ def get_kernel_dimensions(self, dfg_scope):
             # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32)
             # The partial trailing thread-block is emitted as an if-condition
             # that returns on some of the participating threads
-            tbsize = [symbolic.overapproximate(s) for s, step in tbsize]
+            tbsize = [symbolic.overapproximate(s) for s in tbsize]
 
             # Linearize (flatten) rest of dimensions to third
             if len(tbsize) > 3:

From 40de8f6d9eb4b4618f7453e852df610bf7a8572c Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Thu, 5 Sep 2024 14:44:42 +0200
Subject: [PATCH 7/7] Add gpu_force_syncthreads handling to _generate_MapExit

---
 dace/codegen/targets/cuda.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index f080f2cc62..13fa881bc0 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -2618,6 +2618,9 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
 
     def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
                           node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads:
+            callsite_stream.write('__syncthreads();', cfg, state_id)
+
         if node.map.schedule == dtypes.ScheduleType.GPU_Device:
             # Remove grid invocation conditions
             for i in range(len(node.map.params)):
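
Usage sketch (editorial note, not part of the patches): the snippet below illustrates how the `skew` tiling option from patch 2 and the `gpu_force_syncthreads` property from patches 1 and 7 might be used together. The program, the symbol `N`, and the tile size of 64 are hypothetical examples; the surrounding calls (`to_sdfg`, `apply_gpu_transformations`, `apply_transformations`, `all_nodes_recursive`) are existing DaCe API.

    import dace
    from dace.sdfg import nodes
    from dace.transformation.dataflow import MapTiling

    N = dace.symbol('N')

    @dace.program
    def scale(A: dace.float64[N]):
        for i in dace.map[0:N]:
            A[i] = A[i] * 2.0

    sdfg = scale.to_sdfg()
    sdfg.apply_gpu_transformations()

    # Tile the GPU map; skew=True offsets each inner tile back so that it starts at zero
    # (hypothetical tile size of 64)
    sdfg.apply_transformations(MapTiling, options={'tile_sizes': (64,), 'skew': True})

    # Request a __syncthreads() at the exit of every GPU-scheduled map
    # (emitted by _generate_MapExit once patch 7 is applied)
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.MapEntry) and node.map.schedule in dace.dtypes.GPU_SCHEDULES:
            node.map.gpu_force_syncthreads = True

Setting the flag on the entry node's underlying Map object is sufficient: `_generate_MapExit` reads `node.map.gpu_force_syncthreads` on the corresponding MapExit, and entry and exit nodes share the same Map instance.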