From 71496214b06acd86f275a29abae40fc1b5c706da Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Tue, 3 Sep 2024 18:23:22 +0200
Subject: [PATCH 1/7] Add options to force a __syncthreads within a map and to
 prefer the gpu_block_size property if it is set while the auto-detected size
 differs

---
 dace/codegen/targets/cuda.py | 22 ++++++++++++++--------
 dace/sdfg/nodes.py           |  2 ++
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index f080f2cc62..0a4f0a3a37 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1928,14 +1928,17 @@ def get_kernel_dimensions(self, dfg_scope):
 
         # Error when both gpu_block_size and thread-block maps were defined and conflict
         if kernelmap_entry.map.gpu_block_size is not None:
-            raise ValueError('Both the `gpu_block_size` property and internal thread-block '
-                             'maps were defined with conflicting sizes for kernel '
-                             f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). '
-                             'Use `gpu_block_size` only if you do not need access to individual '
-                             'thread-block threads, or explicit block-level synchronization (e.g., '
-                             '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or '
-                             '`GPU_ThreadBlock_Dynamic` schedules. For more information, see '
-                             'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html')
+            block_size = kernelmap_entry.map.gpu_block_size
+
+            warnings.warn('Both the `gpu_block_size` property and internal thread-block '
+                          'maps were defined with conflicting sizes for kernel '
+                          f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). '
+                          'The block size specified in `gpu_block_size` will be preferred. '
+                          'Use `gpu_block_size` only if you do not need access to individual '
+                          'thread-block threads, or explicit block-level synchronization (e.g., '
+                          '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or '
+                          '`GPU_ThreadBlock_Dynamic` schedules. For more information, see '
+                          'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html')
 
         warnings.warn('Multiple thread-block maps with different sizes detected for '
                       f'kernel "{kernelmap_entry.map.label}": {detected_block_sizes}. '
@@ -2618,6 +2621,9 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
 
     def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
                           node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads:
+            callsite_stream.write('__syncthreads();', cfg, state_id)
+
         if node.map.schedule == dtypes.ScheduleType.GPU_Device:
             # Remove grid invocation conditions
             for i in range(len(node.map.params)):
diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py
index 25030b595d..7661ef2bdd 100644
--- a/dace/sdfg/nodes.py
+++ b/dace/sdfg/nodes.py
@@ -925,6 +925,8 @@ class Map(object):
                               optional=True,
                               optional_condition=lambda m: m.schedule in dtypes.GPU_SCHEDULES)
 
+    gpu_force_syncthreads = Property(dtype=bool, desc="Force a syncthreads for the map", default=False)
+
     def __init__(self,
                  label,
                  params,

From 8efe6b5e255210f365e124210442064633012f8d Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Tue, 3 Sep 2024 18:26:16 +0200
Subject: [PATCH 2/7] Add skew option to the map tiling transformation

---
 dace/transformation/dataflow/tiling.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dace/transformation/dataflow/tiling.py b/dace/transformation/dataflow/tiling.py
index bfa899e71a..8a6d75f4db 100644
--- a/dace/transformation/dataflow/tiling.py
+++ b/dace/transformation/dataflow/tiling.py
@@ -33,6 +33,8 @@ class MapTiling(transformation.SingleStateTransformation):
     divides_evenly = Property(dtype=bool, default=False, desc="Tile size divides dimension length evenly")
     tile_trivial = Property(dtype=bool, default=False, desc="Tiles even if tile_size is 1")
 
+    skew = Property(dtype=bool, default=False, desc="If True, offsets the inner tile back such that it starts at zero")
+
     @staticmethod
     def annotates_memlets():
         return True
@@ -92,6 +94,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG):
                 stripmine.tile_stride = str(tile_stride)
                 stripmine.divides_evenly = True
                 stripmine.tile_offset = str(offset)
+                stripmine.skew = self.skew
                 stripmine.apply(graph, sdfg)
                 removed_maps += 1
             else:
@@ -101,6 +104,7 @@ def apply(self, graph: SDFGState, sdfg: SDFG):
                 stripmine.tile_stride = str(tile_stride)
                 stripmine.divides_evenly = self.divides_evenly
                 stripmine.tile_offset = str(offset)
+                stripmine.skew = self.skew
                 stripmine.apply(graph, sdfg)
 
             # apply to the new map the schedule of the original one

From e8c134a4f5303780cfc51edacb40ab124bcf150f Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Tue, 3 Sep 2024 18:41:55 +0200
Subject: [PATCH 3/7] Improve grammar

---
 dace/sdfg/nodes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py
index 7661ef2bdd..f741de943c 100644
--- a/dace/sdfg/nodes.py
+++ b/dace/sdfg/nodes.py
@@ -925,7 +925,7 @@ class Map(object):
                               optional=True,
                               optional_condition=lambda m: m.schedule in dtypes.GPU_SCHEDULES)
 
-    gpu_force_syncthreads = Property(dtype=bool, desc="Force a syncthreads for the map", default=False)
+    gpu_force_syncthreads = Property(dtype=bool, desc="Force a call to __syncthreads at the end of the map", default=False)
 
     def __init__(self,
                  label,

From 675e1dc4e1508b0e605c9f511b8f31e583af2eed Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Thu, 5 Sep 2024 12:25:16 +0200
Subject: [PATCH 4/7] Incorporate step size into the thread-block size
 computation

---
 dace/codegen/targets/cuda.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 0a4f0a3a37..c6234cd383 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1898,11 +1898,14 @@ def get_kernel_dimensions(self, dfg_scope):
         detected_block_sizes = [block_size] if block_size is not None else []
         for tbmap, sym_map in tb_maps_sym_map:
             tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]]
+            tbstepsize = [t[-1] for t in tbmap.range[::-1]]
 
             # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32)
             # The partial trailing thread-block is emitted as an if-condition
             # that returns on some of the participating threads
-            tbsize = [symbolic.overapproximate(s) for s in tbsize]
+            # If the kernel is tiled and the thread-block map has a step size, the over-approximated
+            # thread-block size needs to be divided by the step, as each thread computes more than one element
+            tbsize = [symbolic.overapproximate(symbolic.overapproximate(s)/step) for s, step in zip(tbsize, tbstepsize)]
 
             # Linearize (flatten) rest of dimensions to third
             if len(tbsize) > 3:
@@ -1928,17 +1931,14 @@ def get_kernel_dimensions(self, dfg_scope):
 
         # Error when both gpu_block_size and thread-block maps were defined and conflict
         if kernelmap_entry.map.gpu_block_size is not None:
-            block_size = kernelmap_entry.map.gpu_block_size
-
-            warnings.warn('Both the `gpu_block_size` property and internal thread-block '
-                          'maps were defined with conflicting sizes for kernel '
-                          f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). '
-                          'The block size specified in `gpu_block_size` will be preferred. '
-                          'Use `gpu_block_size` only if you do not need access to individual '
-                          'thread-block threads, or explicit block-level synchronization (e.g., '
-                          '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or '
-                          '`GPU_ThreadBlock_Dynamic` schedules. For more information, see '
-                          'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html')
+            raise ValueError('Both the `gpu_block_size` property and internal thread-block '
+                             'maps were defined with conflicting sizes for kernel '
+                             f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). '
+                             'Use `gpu_block_size` only if you do not need access to individual '
+                             'thread-block threads, or explicit block-level synchronization (e.g., '
+                             '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or '
+                             '`GPU_ThreadBlock_Dynamic` schedules. For more information, see '
+                             'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html')
 
         warnings.warn('Multiple thread-block maps with different sizes detected for '
                       f'kernel "{kernelmap_entry.map.label}": {detected_block_sizes}. '
@@ -2621,9 +2621,6 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
 
     def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
                           node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
-        if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads:
-            callsite_stream.write('__syncthreads();', cfg, state_id)
-
         if node.map.schedule == dtypes.ScheduleType.GPU_Device:
             # Remove grid invocation conditions
             for i in range(len(node.map.params)):

From 5e21392e1cd0957f47fe3e524fdc5dab1c05fe9d Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Thu, 5 Sep 2024 14:33:28 +0200
Subject: [PATCH 5/7] The step is not really necessary for the block size
 detection if the loop starts from 0

---
 dace/codegen/targets/cuda.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index c6234cd383..9aec2418c4 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1898,14 +1898,11 @@ def get_kernel_dimensions(self, dfg_scope):
         detected_block_sizes = [block_size] if block_size is not None else []
         for tbmap, sym_map in tb_maps_sym_map:
             tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]]
-            tbstepsize = [t[-1] for t in tbmap.range[::-1]]
 
             # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32)
             # The partial trailing thread-block is emitted as an if-condition
             # that returns on some of the participating threads
-            # If the kernel is tiled and the thread-block map has a step size, the over-approximated
-            # thread-block size needs to be divided by the step, as each thread computes more than one element
-            tbsize = [symbolic.overapproximate(symbolic.overapproximate(s)/step) for s, step in zip(tbsize, tbstepsize)]
+            tbsize = [symbolic.overapproximate(s) for s, step in tbsize]
 
             # Linearize (flatten) rest of dimensions to third
             if len(tbsize) > 3:

From cd615940882ee6a5676d209489ac5aa02288e3a9 Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Thu, 5 Sep 2024 14:36:19 +0200
Subject: [PATCH 6/7] Fix typo

---
 dace/codegen/targets/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 9aec2418c4..f080f2cc62 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1902,7 +1902,7 @@ def get_kernel_dimensions(self, dfg_scope):
             # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32)
             # The partial trailing thread-block is emitted as an if-condition
             # that returns on some of the participating threads
-            tbsize = [symbolic.overapproximate(s) for s, step in tbsize]
+            tbsize = [symbolic.overapproximate(s) for s in tbsize]
 
             # Linearize (flatten) rest of dimensions to third
             if len(tbsize) > 3:

From 40de8f6d9eb4b4618f7453e852df610bf7a8572c Mon Sep 17 00:00:00 2001
From: thrudprimrose
Date: Thu, 5 Sep 2024 14:44:42 +0200
Subject: [PATCH 7/7] Add gpu_force_syncthreads handling to _generate_MapExit

---
 dace/codegen/targets/cuda.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index f080f2cc62..13fa881bc0 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -2618,6 +2618,9 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
 
     def _generate_MapExit(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
                           node: nodes.MapExit, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+        if isinstance(node, nodes.MapExit) and node.map.gpu_force_syncthreads:
+            callsite_stream.write('__syncthreads();', cfg, state_id)
+
         if node.map.schedule == dtypes.ScheduleType.GPU_Device:
             # Remove grid invocation conditions
             for i in range(len(node.map.params)):
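
Usage sketch (editorial note, not part of the patches): the snippet below illustrates how the `skew` tiling option from patch 2 and the `gpu_force_syncthreads` property from patches 1 and 7 might be used together. The program, the symbol `N`, and the tile size of 64 are hypothetical examples; the surrounding calls (`to_sdfg`, `apply_gpu_transformations`, `apply_transformations`, `all_nodes_recursive`) are existing DaCe API.

    import dace
    from dace.sdfg import nodes
    from dace.transformation.dataflow import MapTiling

    N = dace.symbol('N')

    @dace.program
    def scale(A: dace.float64[N]):
        for i in dace.map[0:N]:
            A[i] = A[i] * 2.0

    sdfg = scale.to_sdfg()
    sdfg.apply_gpu_transformations()

    # Tile the GPU map; skew=True offsets each inner tile back so that it starts at zero
    # (hypothetical tile size of 64)
    sdfg.apply_transformations(MapTiling, options={'tile_sizes': (64,), 'skew': True})

    # Request a __syncthreads() at the exit of every GPU-scheduled map
    # (emitted by _generate_MapExit once patch 7 is applied)
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.MapEntry) and node.map.schedule in dace.dtypes.GPU_SCHEDULES:
            node.map.gpu_force_syncthreads = True

Setting the flag on the entry node's underlying Map object is sufficient: `_generate_MapExit` reads `node.map.gpu_force_syncthreads` on the corresponding MapExit, and entry and exit nodes share the same Map instance.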