From cb1c4c5517569b9a611e9ed8dd159bcb5188c34b Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Thu, 13 Jun 2024 09:59:19 +0000 Subject: [PATCH 01/12] SingleColumn: Move wrap_vector_section into standalone method --- loki/transformations/single_column/vector.py | 58 ++++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index b938e5359..5dce3b2e6 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -28,7 +28,7 @@ __all__ = [ 'SCCDevectorTransformation', 'SCCRevectorTransformation', - 'SCCDemoteTransformation' + 'SCCDemoteTransformation', 'wrap_vector_section' ] @@ -249,6 +249,33 @@ def process_driver(self, routine, targets=()): routine.body = Transformer(driver_loop_map).visit(routine.body) +def wrap_vector_section(section, routine, horizontal): + """ + Wrap a section of nodes in a vector-level loop across the horizontal. + + Parameters + ---------- + section : tuple of :any:`Node` + A section of nodes to be wrapped in a vector-level loop + routine : :any:`Subroutine` + The subroutine in the vector loops should be removed. + horizontal: :any:`Dimension` + The dimension specifying the horizontal vector dimension + """ + bounds = get_loop_bounds(routine, dimension=horizontal) + + # Create a single loop around the horizontal from a given body + index = get_integer_variable(routine, horizontal.index) + bounds = sym.LoopRange(bounds) + + # Ensure we clone all body nodes, to avoid recursion issues + vector_loop = ir.Loop(variable=index, bounds=bounds, body=Transformer().visit(section)) + + # Add a comment before and after the pragma-annotated loop to ensure + # we do not overlap with neighbouring pragmas + return (ir.Comment(''), vector_loop, ir.Comment('')) + + class SCCRevectorTransformation(Transformation): """ A transformation to wrap thread-parallel IR sections within a horizontal loop. 
@@ -265,33 +292,6 @@ def __init__(self, horizontal, remove_vector_section=False): self.horizontal = horizontal self.remove_vector_section = remove_vector_section - @classmethod - def wrap_vector_section(cls, section, routine, horizontal): - """ - Wrap a section of nodes in a vector-level loop across the horizontal. - - Parameters - ---------- - section : tuple of :any:`Node` - A section of nodes to be wrapped in a vector-level loop - routine : :any:`Subroutine` - The subroutine in the vector loops should be removed. - horizontal: :any:`Dimension` - The dimension specifying the horizontal vector dimension - """ - bounds = get_loop_bounds(routine, dimension=horizontal) - - # Create a single loop around the horizontal from a given body - index = get_integer_variable(routine, horizontal.index) - bounds = sym.LoopRange(bounds) - - # Ensure we clone all body nodes, to avoid recursion issues - vector_loop = ir.Loop(variable=index, bounds=bounds, body=Transformer().visit(section)) - - # Add a comment before and after the pragma-annotated loop to ensure - # we do not overlap with neighbouring pragmas - return (ir.Comment(''), vector_loop, ir.Comment('')) - def transform_subroutine(self, routine, **kwargs): """ Apply SCCRevector utilities to a :any:`Subroutine`. @@ -303,7 +303,7 @@ def transform_subroutine(self, routine, **kwargs): routine : :any:`Subroutine` Subroutine to apply this transformation to. 
""" - mapper = {s.body: self.wrap_vector_section(s.body, routine, self.horizontal) + mapper = {s.body: wrap_vector_section(s.body, routine, self.horizontal) for s in FindNodes(ir.Section).visit(routine.body) if s.label == 'vector_section'} routine.body = NestedTransformer(mapper).visit(routine.body) From cb83a00b73fb7c3ac2c3600108b032e464c63226 Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Sun, 28 Jul 2024 05:20:00 +0000 Subject: [PATCH 02/12] SCC: Let SCCRevector mark vector loops and SCCAnnotate translates Instead of SCCAnnotate trying to find vector loops, the routine that creates them marks them with `!$loki loop vector`, which SCCAnnotate then translates to OpenACC pragmas. This also uses in-place updates in SCCAnnotate now to speed up processing. --- .../transformations/single_column/annotate.py | 18 ++++----- .../single_column/tests/test_scc_vector.py | 40 ++++++++++++------- loki/transformations/single_column/vector.py | 10 ++++- 3 files changed, 40 insertions(+), 28 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index b0fdea690..2c8d073e8 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -51,7 +51,7 @@ def __init__(self, horizontal, directive, block_dim): self.block_dim = block_dim @classmethod - def kernel_annotate_vector_loops_openacc(cls, routine, horizontal): + def kernel_annotate_vector_loops_openacc(cls, routine): """ Insert ``!$acc loop vector`` annotations around horizontal vector loops, including the necessary private variable declarations. @@ -60,8 +60,6 @@ def kernel_annotate_vector_loops_openacc(cls, routine, horizontal): ---------- routine : :any:`Subroutine` The subroutine in the vector loops should be removed. 
- horizontal: :any:`Dimension` - The dimension object specifying the horizontal vector dimension """ # Find any local arrays that need explicitly privatization @@ -87,21 +85,19 @@ def kernel_annotate_vector_loops_openacc(cls, routine, horizontal): loops = FindNodes(ir.Loop).visit(region) assert len(loops) == 1 pragma = ir.Pragma(keyword='acc', content=f'loop vector {reduction_clause[0]}') - mapper[loops[0]] = loops[0].clone(pragma=(pragma,)) - mapper[region.pragma] = None - mapper[region.pragma_post] = None + # Update loop and region in place to remove marker pragmas + loops[0]._update(pragma=(pragma,)) + region._update(pragma=None, pragma_post=None) with pragmas_attached(routine, ir.Loop): for loop in FindNodes(ir.Loop).visit(routine.body): - if loop.variable == horizontal.index and not loop in mapper: + if is_loki_pragma(loop.pragma, starts_with='loop vector'): # Construct pragma and wrap entire body in vector loop private_arrs = ', '.join(v.name for v in private_arrays) pragma = () private_clause = '' if not private_arrays else f' private({private_arrs})' pragma = ir.Pragma(keyword='acc', content=f'loop vector{private_clause}') - mapper[loop] = loop.clone(pragma=(pragma,)) - - routine.body = Transformer(mapper).visit(routine.body) + loop._update(pragma=(pragma,)) @classmethod def kernel_annotate_sequential_loops_openacc(cls, routine, horizontal, block_dim=None, ignore=()): @@ -162,7 +158,7 @@ def kernel_annotate_subroutine_present_openacc(cls, routine): def insert_annotations(cls, routine, horizontal): # Mark all parallel vector loops as `!$acc loop vector` - cls.kernel_annotate_vector_loops_openacc(routine, horizontal) + cls.kernel_annotate_vector_loops_openacc(routine) # Mark all non-parallel loops as `!$acc loop seq` cls.kernel_annotate_sequential_loops_openacc(routine, horizontal) diff --git a/loki/transformations/single_column/tests/test_scc_vector.py b/loki/transformations/single_column/tests/test_scc_vector.py index 99dbf919f..929b405b6 100644 --- 
a/loki/transformations/single_column/tests/test_scc_vector.py +++ b/loki/transformations/single_column/tests/test_scc_vector.py @@ -11,7 +11,7 @@ from loki.frontend import available_frontends from loki.ir import ( FindNodes, Assignment, CallStatement, Conditional, Comment, Loop, - Pragma, Section + Pragma, Section, pragmas_attached, is_loki_pragma ) from loki.transformations.single_column import ( SCCDevectorTransformation, SCCRevectorTransformation, SCCVectorPipeline @@ -102,13 +102,18 @@ def test_scc_revector_transformation(frontend, horizontal): # Ensure we have two nested loops in the kernel # (the hoisted horizontal and the native vertical) - kernel_loops = FindNodes(Loop).visit(kernel.body) - assert len(kernel_loops) == 2 - assert kernel_loops[1] in FindNodes(Loop).visit(kernel_loops[0].body) - assert kernel_loops[0].variable == 'jl' - assert kernel_loops[0].bounds == 'start:end' - assert kernel_loops[1].variable == 'jk' - assert kernel_loops[1].bounds == '2:nz' + with pragmas_attached(kernel, node_type=Loop): + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 2 + assert kernel_loops[1] in FindNodes(Loop).visit(kernel_loops[0].body) + assert kernel_loops[0].variable == 'jl' + assert kernel_loops[0].bounds == 'start:end' + assert kernel_loops[1].variable == 'jk' + assert kernel_loops[1].bounds == '2:nz' + + # Check internal loop pragma annotations + assert kernel_loops[0].pragma + assert is_loki_pragma(kernel_loops[0].pragma, starts_with='loop vector') # Ensure all expressions and array indices are unchanged assigns = FindNodes(Assignment).visit(kernel.body) @@ -204,13 +209,18 @@ def test_scc_revector_transformation_aliased_bounds(frontend, horizontal_bounds_ # Ensure we have two nested loops in the kernel # (the hoisted horizontal and the native vertical) - kernel_loops = FindNodes(Loop).visit(kernel.body) - assert len(kernel_loops) == 2 - assert kernel_loops[1] in FindNodes(Loop).visit(kernel_loops[0].body) - assert 
kernel_loops[0].variable == 'jl' - assert kernel_loops[0].bounds == 'bnds%start:bnds%end' - assert kernel_loops[1].variable == 'jk' - assert kernel_loops[1].bounds == '2:nz' + with pragmas_attached(kernel, node_type=Loop): + kernel_loops = FindNodes(Loop).visit(kernel.body) + assert len(kernel_loops) == 2 + assert kernel_loops[1] in FindNodes(Loop).visit(kernel_loops[0].body) + assert kernel_loops[0].variable == 'jl' + assert kernel_loops[0].bounds == 'bnds%start:bnds%end' + assert kernel_loops[1].variable == 'jk' + assert kernel_loops[1].bounds == '2:nz' + + # Check internal loop pragma annotations + assert kernel_loops[0].pragma + assert is_loki_pragma(kernel_loops[0].pragma, starts_with='loop vector') # Ensure all expressions and array indices are unchanged assigns = FindNodes(Assignment).visit(kernel.body) diff --git a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index 5dce3b2e6..1b3d0bfcb 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -249,7 +249,7 @@ def process_driver(self, routine, targets=()): routine.body = Transformer(driver_loop_map).visit(routine.body) -def wrap_vector_section(section, routine, horizontal): +def wrap_vector_section(section, routine, horizontal, insert_pragma=True): """ Wrap a section of nodes in a vector-level loop across the horizontal. @@ -261,6 +261,8 @@ def wrap_vector_section(section, routine, horizontal): The subroutine in the vector loops should be removed. 
horizontal: :any:`Dimension` The dimension specifying the horizontal vector dimension + insert_pragma: bool, optional + Adds a ``!$loki vector`` pragma around the created loop """ bounds = get_loop_bounds(routine, dimension=horizontal) @@ -269,7 +271,11 @@ def wrap_vector_section(section, routine, horizontal): bounds = sym.LoopRange(bounds) # Ensure we clone all body nodes, to avoid recursion issues - vector_loop = ir.Loop(variable=index, bounds=bounds, body=Transformer().visit(section)) + body = Transformer().visit(section) + + # Add a marker pragma for later annotations + pragma = (ir.Pragma('loki', content='loop vector'),) if insert_pragma else None + vector_loop = ir.Loop(variable=index, bounds=bounds, body=body, pragma=pragma) # Add a comment before and after the pragma-annotated loop to ensure # we do not overlap with neighbouring pragmas From 9db0df86fabfde10e31edc5a62d990ce7f58a3f8 Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Tue, 6 Aug 2024 14:18:12 +0000 Subject: [PATCH 03/12] SCC: Mark "seq loops" in SCCRevector and let SCCAnnotate translate A small bit of refactoring of the `SCCRevector` core routine also ensures that we now only detect `!$loki loop seq` loops inside driver-loops. 
--- .../transformations/single_column/annotate.py | 27 ++---- .../single_column/tests/test_scc_vector.py | 4 + loki/transformations/single_column/vector.py | 83 +++++++++++++++++-- 3 files changed, 88 insertions(+), 26 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index 2c8d073e8..5e5cd0e1f 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -100,33 +100,23 @@ def kernel_annotate_vector_loops_openacc(cls, routine): loop._update(pragma=(pragma,)) @classmethod - def kernel_annotate_sequential_loops_openacc(cls, routine, horizontal, block_dim=None, ignore=()): + def kernel_annotate_sequential_loops_openacc(cls, routine): """ - Insert ``!$acc loop seq`` annotations around all loops that - are not horizontal vector loops. + Insert ``!$acc loop seq`` annotations for all loops previously + marked with ``!$loki loop seq``. Parameters ---------- routine : :any:`Subroutine` The subroutine in which to annotate sequential loops - horizontal: :any:`Dimension` - The dimension object specifying the horizontal vector dimension - block_dim: :any: `Dimension` - The dimension object specifying the blocking dimension - ignore: list or tuple - Loops to be ignored for annotation """ - block_dim_index = None if block_dim is None else block_dim.index with pragmas_attached(routine, ir.Loop): - for loop in FindNodes(ir.Loop).visit(routine.body): - # Skip loops explicitly marked with `!$loki/claw nodep` - if loop.pragma and any('nodep' in p.content.lower() for p in as_tuple(loop.pragma)): + if not is_loki_pragma(loop.pragma, starts_with='loop seq'): continue - if loop.variable != horizontal.index and loop.variable != block_dim_index and loop not in ignore: - # Perform pragma addition in place to avoid nested loop replacements - loop._update(pragma=(ir.Pragma(keyword='acc', content='loop seq'),)) + # Replace internal `!$loki loop seq`` pragam with `!$acc` 
equivalent + loop._update(pragma=(ir.Pragma(keyword='acc', content='loop seq'),)) # Warn if we detect vector insisde sequential loop nesting nested_loops = FindNodes(ir.Loop).visit(loop.body) @@ -161,7 +151,7 @@ def insert_annotations(cls, routine, horizontal): cls.kernel_annotate_vector_loops_openacc(routine) # Mark all non-parallel loops as `!$acc loop seq` - cls.kernel_annotate_sequential_loops_openacc(routine, horizontal) + cls.kernel_annotate_sequential_loops_openacc(routine) # Wrap the routine body in `!$acc data present` markers # to ensure device-resident data is used for array and struct arguments. @@ -249,8 +239,7 @@ def process_driver(self, routine, targets=None): if self.directive == 'openacc': # Mark all non-parallel loops as `!$acc loop seq` - self.kernel_annotate_sequential_loops_openacc(routine, self.horizontal, self.block_dim, - ignore=driver_loops) + self.kernel_annotate_sequential_loops_openacc(routine) # Remove the vector section wrappers # These have been inserted by SCCDevectorTransformation diff --git a/loki/transformations/single_column/tests/test_scc_vector.py b/loki/transformations/single_column/tests/test_scc_vector.py index 929b405b6..f74dd9be4 100644 --- a/loki/transformations/single_column/tests/test_scc_vector.py +++ b/loki/transformations/single_column/tests/test_scc_vector.py @@ -114,6 +114,8 @@ def test_scc_revector_transformation(frontend, horizontal): # Check internal loop pragma annotations assert kernel_loops[0].pragma assert is_loki_pragma(kernel_loops[0].pragma, starts_with='loop vector') + assert kernel_loops[1].pragma + assert is_loki_pragma(kernel_loops[1].pragma, starts_with='loop seq') # Ensure all expressions and array indices are unchanged assigns = FindNodes(Assignment).visit(kernel.body) @@ -221,6 +223,8 @@ def test_scc_revector_transformation_aliased_bounds(frontend, horizontal_bounds_ # Check internal loop pragma annotations assert kernel_loops[0].pragma assert is_loki_pragma(kernel_loops[0].pragma, 
starts_with='loop vector') + assert kernel_loops[1].pragma + assert is_loki_pragma(kernel_loops[1].pragma, starts_with='loop seq') # Ensure all expressions and array indices are unchanged assigns = FindNodes(Assignment).visit(kernel.body) diff --git a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index 1b3d0bfcb..4ff05540e 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -298,21 +298,90 @@ def __init__(self, horizontal, remove_vector_section=False): self.horizontal = horizontal self.remove_vector_section = remove_vector_section + def revector_section(self, routine, section): + """ + Wrap all thread-parallel :any:`Section` objects within a given + code section in a horizontal loop and mark interior loops as + ``!$loki loop seq``. + + Parameters + ---------- + routine : :any:`Subroutine` + Subroutine to apply this transformation to. + section : tuple of :any:`Node` + Code section in which to replace vector-parallel + :any:`Section` objects. + """ + # Wrap all thread-parallel sections into horizontal thread loops + mapper = { + s.body: wrap_vector_section(s.body, routine, self.horizontal) + for s in FindNodes(ir.Section).visit(section) + if s.label == 'vector_section' + } + return NestedTransformer(mapper).visit(section) + + def mark_seq_loops(self, section): + """ + Mark interior sequential loops in a thread-parallel section + with ``!$loki loop seq`` for later annotation. + + This utility requires loop-pragmas to be attached via + :any:`pragmas_attached`. It also updates loops in-place. + + Parameters + ---------- + section : tuple of :any:`Node` + Code section in which to mark "seq loops". 
+ """ + for loop in FindNodes(ir.Loop).visit(section): + + # Skip loops explicitly marked with `!$loki/claw nodep` + if loop.pragma and any('nodep' in p.content.lower() for p in as_tuple(loop.pragma)): + continue + + # Mark loop as sequential with `!$loki loop seq` + if loop.variable != self.horizontal.index: + loop._update(pragma=(ir.Pragma(keyword='loki', content='loop seq'),)) + def transform_subroutine(self, routine, **kwargs): """ - Apply SCCRevector utilities to a :any:`Subroutine`. - It wraps all thread-parallel sections within - a horizontal loop. The markers placed by :any:`SCCDevectorTransformation` are removed + Wrap vector-parallel sections in vector :any:`Loop` objects. + + This wraps all thread-parallel sections within "kernel" + routines or within the parallel loops in "driver" routines. + + The markers placed by :any:`SCCDevectorTransformation` are removed Parameters ---------- routine : :any:`Subroutine` Subroutine to apply this transformation to. + role : str + Must be either ``"kernel"`` or ``"driver"`` + targets : tuple or str + Tuple of target routine names for determining "driver" loops """ - mapper = {s.body: wrap_vector_section(s.body, routine, self.horizontal) - for s in FindNodes(ir.Section).visit(routine.body) - if s.label == 'vector_section'} - routine.body = NestedTransformer(mapper).visit(routine.body) + role = kwargs['role'] + targets = kwargs.get('targets', ()) + + if role == 'kernel': + # Revector all marked vector sections within the kernel body + routine.body = self.revector_section(routine, routine.body) + + # Mark sequential loops inside vector sections + with pragmas_attached(routine, ir.Loop): + self.mark_seq_loops(routine.body) + + if role == 'driver': + with pragmas_attached(routine, ir.Loop, attach_pragma_post=True): + driver_loops = find_driver_loops(routine=routine, targets=targets) + + for loop in driver_loops: + # Revector all marked sections within the driver loop body + loop._update(body=self.revector_section(routine, 
loop.body)) + + # Mark sequential loops inside vector sections + self.mark_seq_loops(loop.body) if self.remove_vector_section: # Remove the vector section wrappers From c88afa4cb2d33d82043a0c7da0f53dd64a3d75fc Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Tue, 6 Aug 2024 14:51:25 +0000 Subject: [PATCH 04/12] SCC: Clean up imports in test_scc_vector.py --- .../single_column/tests/test_scc_vector.py | 79 ++++++++++--------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/loki/transformations/single_column/tests/test_scc_vector.py b/loki/transformations/single_column/tests/test_scc_vector.py index f74dd9be4..33a0f0c24 100644 --- a/loki/transformations/single_column/tests/test_scc_vector.py +++ b/loki/transformations/single_column/tests/test_scc_vector.py @@ -10,8 +10,7 @@ from loki import Subroutine, Sourcefile, Dimension, fgen from loki.frontend import available_frontends from loki.ir import ( - FindNodes, Assignment, CallStatement, Conditional, Comment, Loop, - Pragma, Section, pragmas_attached, is_loki_pragma + nodes as ir, FindNodes, pragmas_attached, is_loki_pragma ) from loki.transformations.single_column import ( SCCDevectorTransformation, SCCRevectorTransformation, SCCVectorPipeline @@ -91,7 +90,7 @@ def test_scc_revector_transformation(frontend, horizontal): driver = Subroutine.from_source(fcode_driver, frontend=frontend) # Ensure we have three loops in the kernel prior to transformation - kernel_loops = FindNodes(Loop).visit(kernel.body) + kernel_loops = FindNodes(ir.Loop).visit(kernel.body) assert len(kernel_loops) == 3 scc_transform = (SCCDevectorTransformation(horizontal=horizontal),) @@ -102,10 +101,10 @@ def test_scc_revector_transformation(frontend, horizontal): # Ensure we have two nested loops in the kernel # (the hoisted horizontal and the native vertical) - with pragmas_attached(kernel, node_type=Loop): - kernel_loops = FindNodes(Loop).visit(kernel.body) + with pragmas_attached(kernel, node_type=ir.Loop): + kernel_loops = 
FindNodes(ir.Loop).visit(kernel.body) assert len(kernel_loops) == 2 - assert kernel_loops[1] in FindNodes(Loop).visit(kernel_loops[0].body) + assert kernel_loops[1] in FindNodes(ir.Loop).visit(kernel_loops[0].body) assert kernel_loops[0].variable == 'jl' assert kernel_loops[0].bounds == 'start:end' assert kernel_loops[1].variable == 'jk' @@ -118,18 +117,18 @@ def test_scc_revector_transformation(frontend, horizontal): assert is_loki_pragma(kernel_loops[1].pragma, starts_with='loop seq') # Ensure all expressions and array indices are unchanged - assigns = FindNodes(Assignment).visit(kernel.body) + assigns = FindNodes(ir.Assignment).visit(kernel.body) assert fgen(assigns[1]).lower() == 't(jl, jk) = c*jk' assert fgen(assigns[2]).lower() == 'q(jl, jk) = q(jl, jk - 1) + t(jl, jk)*c' assert fgen(assigns[3]).lower() == 'q(jl, nz) = q(jl, nz)*c' # Ensure driver remains unaffected - driver_loops = FindNodes(Loop).visit(driver.body) + driver_loops = FindNodes(ir.Loop).visit(driver.body) assert len(driver_loops) == 1 assert driver_loops[0].variable == 'b' assert driver_loops[0].bounds == '1:nb' - kernel_calls = FindNodes(CallStatement).visit(driver_loops[0]) + kernel_calls = FindNodes(ir.CallStatement).visit(driver_loops[0]) assert len(kernel_calls) == 1 assert kernel_calls[0].name == 'compute_column' @@ -200,7 +199,7 @@ def test_scc_revector_transformation_aliased_bounds(frontend, horizontal_bounds_ definitions=bnds_type_mod.definitions).subroutines[0] # Ensure we have three loops in the kernel prior to transformation - kernel_loops = FindNodes(Loop).visit(kernel.body) + kernel_loops = FindNodes(ir.Loop).visit(kernel.body) assert len(kernel_loops) == 3 scc_transform = (SCCDevectorTransformation(horizontal=horizontal_bounds_aliases),) @@ -211,10 +210,10 @@ def test_scc_revector_transformation_aliased_bounds(frontend, horizontal_bounds_ # Ensure we have two nested loops in the kernel # (the hoisted horizontal and the native vertical) - with pragmas_attached(kernel, 
node_type=Loop): - kernel_loops = FindNodes(Loop).visit(kernel.body) + with pragmas_attached(kernel, node_type=ir.Loop): + kernel_loops = FindNodes(ir.Loop).visit(kernel.body) assert len(kernel_loops) == 2 - assert kernel_loops[1] in FindNodes(Loop).visit(kernel_loops[0].body) + assert kernel_loops[1] in FindNodes(ir.Loop).visit(kernel_loops[0].body) assert kernel_loops[0].variable == 'jl' assert kernel_loops[0].bounds == 'bnds%start:bnds%end' assert kernel_loops[1].variable == 'jk' @@ -227,18 +226,18 @@ def test_scc_revector_transformation_aliased_bounds(frontend, horizontal_bounds_ assert is_loki_pragma(kernel_loops[1].pragma, starts_with='loop seq') # Ensure all expressions and array indices are unchanged - assigns = FindNodes(Assignment).visit(kernel.body) + assigns = FindNodes(ir.Assignment).visit(kernel.body) assert fgen(assigns[1]).lower() == 't(jl, jk) = c*jk' assert fgen(assigns[2]).lower() == 'q(jl, jk) = q(jl, jk - 1) + t(jl, jk)*c' assert fgen(assigns[3]).lower() == 'q(jl, nz) = q(jl, nz)*c' # Ensure driver remains unaffected - driver_loops = FindNodes(Loop).visit(driver.body) + driver_loops = FindNodes(ir.Loop).visit(driver.body) assert len(driver_loops) == 1 assert driver_loops[0].variable == 'b' assert driver_loops[0].bounds == '1:nb' - kernel_calls = FindNodes(CallStatement).visit(driver_loops[0]) + kernel_calls = FindNodes(ir.CallStatement).visit(driver_loops[0]) assert len(kernel_calls) == 1 assert kernel_calls[0].name == 'compute_column' @@ -294,7 +293,7 @@ def test_scc_devector_transformation(frontend, horizontal): kernel = Subroutine.from_source(fcode_kernel, frontend=frontend) # Check number of horizontal loops prior to transformation - loops = [l for l in FindNodes(Loop).visit(kernel.body) if l.variable == 'jl'] + loops = [l for l in FindNodes(ir.Loop).visit(kernel.body) if l.variable == 'jl'] assert len(loops) == 4 # Test SCCDevector transform for kernel with scope-splitting outer loop @@ -302,20 +301,23 @@ def 
test_scc_devector_transformation(frontend, horizontal): scc_transform.apply(kernel, role='kernel') # Check removal of horizontal loops - loops = [l for l in FindNodes(Loop).visit(kernel.body) if l.variable == 'jl'] + loops = [l for l in FindNodes(ir.Loop).visit(kernel.body) if l.variable == 'jl'] assert not loops # Check number and content of vector sections - sections = [s for s in FindNodes(Section).visit(kernel.body) if s.label == 'vector_section'] + sections = [ + s for s in FindNodes(ir.Section).visit(kernel.body) + if s.label == 'vector_section' + ] assert len(sections) == 4 - assigns = FindNodes(Assignment).visit(sections[0]) + assigns = FindNodes(ir.Assignment).visit(sections[0]) assert len(assigns) == 2 - assigns = FindNodes(Assignment).visit(sections[1]) + assigns = FindNodes(ir.Assignment).visit(sections[1]) assert len(assigns) == 1 - assigns = FindNodes(Assignment).visit(sections[2]) + assigns = FindNodes(ir.Assignment).visit(sections[2]) assert len(assigns) == 1 - assigns = FindNodes(Assignment).visit(sections[3]) + assigns = FindNodes(ir.Assignment).visit(sections[3]) assert len(assigns) == 1 @@ -363,14 +365,14 @@ def test_scc_vector_inlined_call(frontend, horizontal): transform.apply(routine, role='kernel', targets=['some_kernel', 'some_inlined_kernel']) # Check loki pragma has been removed - assert not FindNodes(Pragma).visit(routine.body) + assert not FindNodes(ir.Pragma).visit(routine.body) # Check that 'some_inlined_kernel' remains within vector-parallel region - loops = FindNodes(Loop).visit(routine.body) + loops = FindNodes(ir.Loop).visit(routine.body) assert len(loops) == 1 - calls = FindNodes(CallStatement).visit(loops[0].body) + calls = FindNodes(ir.CallStatement).visit(loops[0].body) assert len(calls) == 1 - calls = FindNodes(CallStatement).visit(routine.body) + calls = FindNodes(ir.CallStatement).visit(routine.body) assert len(calls) == 2 @@ -407,9 +409,12 @@ def test_scc_vector_section_trim_simple(frontend, horizontal, trim_vector_sectio 
for transform in scc_transform: transform.apply(routine, role='kernel', targets=['some_kernel',]) - assign = FindNodes(Assignment).visit(routine.body)[0] - loop = FindNodes(Loop).visit(routine.body)[0] - comment = [c for c in FindNodes(Comment).visit(routine.body) if c.text == '! random comment'][0] + assign = FindNodes(ir.Assignment).visit(routine.body)[0] + loop = FindNodes(ir.Loop).visit(routine.body)[0] + comment = [ + c for c in FindNodes(ir.Comment).visit(routine.body) + if c.text == '! random comment' + ][0] # check we found the right assignment assert assign.lhs.name.lower() == 'flag0' @@ -471,8 +476,8 @@ def test_scc_vector_section_trim_nested(frontend, horizontal, trim_vector_sectio for transform in scc_transform: transform.apply(routine, role='kernel', targets=['some_kernel',]) - cond = FindNodes(Conditional).visit(routine.body)[0] - loop = FindNodes(Loop).visit(routine.body)[0] + cond = FindNodes(ir.Conditional).visit(routine.body)[0] + loop = FindNodes(ir.Loop).visit(routine.body)[0] if trim_vector_sections: assert cond not in loop.body @@ -523,19 +528,19 @@ def test_scc_vector_section_trim_complex( ) scc_pipeline.apply(routine, role='kernel', targets=['some_kernel',]) - assign = FindNodes(Assignment).visit(routine.body)[0] + assign = FindNodes(ir.Assignment).visit(routine.body)[0] # check we found the right assignment assert assign.lhs.name.lower() == 'flag1' - cond = FindNodes(Conditional).visit(routine.body)[0] - loop = FindNodes(Loop).visit(routine.body)[0] + cond = FindNodes(ir.Conditional).visit(routine.body)[0] + loop = FindNodes(ir.Loop).visit(routine.body)[0] assert cond in loop.body assert cond not in routine.body.body if trim_vector_sections: assert assign not in loop.body - assert(len(FindNodes(Assignment).visit(loop.body)) == 3) + assert(len(FindNodes(ir.Assignment).visit(loop.body)) == 3) else: assert assign in loop.body - assert(len(FindNodes(Assignment).visit(loop.body)) == 4) + assert(len(FindNodes(ir.Assignment).visit(loop.body)) == 
4) From 6aa4fd791627225390c05d4671f0217f7586cf9d Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Wed, 7 Aug 2024 05:22:53 +0000 Subject: [PATCH 05/12] SCC: Remove vector-section label in SCCRevector --- loki/transformations/single_column/annotate.py | 12 ------------ .../single_column/tests/test_scc_vector.py | 14 ++++++++++++-- loki/transformations/single_column/vector.py | 4 ++-- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index 5e5cd0e1f..38099247b 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -198,12 +198,6 @@ def process_kernel(self, routine): if self.directive == 'openacc': self.insert_annotations(routine, self.horizontal) - # Remove the vector section wrappers - # These have been inserted by SCCDevectorTransformation - section_mapper = {s: s.body for s in FindNodes(ir.Section).visit(routine.body) if s.label == 'vector_section'} - if section_mapper: - routine.body = Transformer(section_mapper).visit(routine.body) - def process_driver(self, routine, targets=None): """ Apply the relevant ``'openacc'`` annotations to the driver loop. 
@@ -241,12 +235,6 @@ def process_driver(self, routine, targets=None): # Mark all non-parallel loops as `!$acc loop seq` self.kernel_annotate_sequential_loops_openacc(routine) - # Remove the vector section wrappers - # These have been inserted by SCCDevectorTransformation - section_mapper = {s: s.body for s in FindNodes(ir.Section).visit(routine.body) if s.label == 'vector_section'} - if section_mapper: - routine.body = Transformer(section_mapper).visit(routine.body) - @classmethod def device_alloc_column_locals(cls, routine, column_locals): """ diff --git a/loki/transformations/single_column/tests/test_scc_vector.py b/loki/transformations/single_column/tests/test_scc_vector.py index 33a0f0c24..8057a5969 100644 --- a/loki/transformations/single_column/tests/test_scc_vector.py +++ b/loki/transformations/single_column/tests/test_scc_vector.py @@ -122,6 +122,10 @@ def test_scc_revector_transformation(frontend, horizontal): assert fgen(assigns[2]).lower() == 'q(jl, jk) = q(jl, jk - 1) + t(jl, jk)*c' assert fgen(assigns[3]).lower() == 'q(jl, nz) = q(jl, nz)*c' + # Ensure that vector-section labels have been removed + sections = FindNodes(ir.Section).visit(kernel.body) + assert all(not s.label for s in sections) + # Ensure driver remains unaffected driver_loops = FindNodes(ir.Loop).visit(driver.body) assert len(driver_loops) == 1 @@ -231,6 +235,10 @@ def test_scc_revector_transformation_aliased_bounds(frontend, horizontal_bounds_ assert fgen(assigns[2]).lower() == 'q(jl, jk) = q(jl, jk - 1) + t(jl, jk)*c' assert fgen(assigns[3]).lower() == 'q(jl, nz) = q(jl, nz)*c' + # Ensure that vector-section labels have been removed + sections = FindNodes(ir.Section).visit(kernel.body) + assert all(not s.label for s in sections) + # Ensure driver remains unaffected driver_loops = FindNodes(ir.Loop).visit(driver.body) assert len(driver_loops) == 1 @@ -364,8 +372,10 @@ def test_scc_vector_inlined_call(frontend, horizontal): for transform in scc_transform: transform.apply(routine, 
role='kernel', targets=['some_kernel', 'some_inlined_kernel']) - # Check loki pragma has been removed - assert not FindNodes(ir.Pragma).visit(routine.body) + # Check only `!$loki loop vector` pragma has been inserted + pragmas = FindNodes(ir.Pragma).visit(routine.body) + assert len(pragmas) == 1 + assert is_loki_pragma(pragmas[0], starts_with='loop vector') # Check that 'some_inlined_kernel' remains within vector-parallel region loops = FindNodes(ir.Loop).visit(routine.body) diff --git a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index 4ff05540e..16f4b19c5 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -314,11 +314,11 @@ def revector_section(self, routine, section): """ # Wrap all thread-parallel sections into horizontal thread loops mapper = { - s.body: wrap_vector_section(s.body, routine, self.horizontal) + s: wrap_vector_section(s.body, routine, self.horizontal) for s in FindNodes(ir.Section).visit(section) if s.label == 'vector_section' } - return NestedTransformer(mapper).visit(section) + return Transformer(mapper).visit(section) def mark_seq_loops(self, section): """ From c1ce78340ba5c9b9ee3461c9ad7d0b9ace49c35c Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Wed, 7 Aug 2024 10:04:12 +0000 Subject: [PATCH 06/12] SCC: Mark driver loops in SCCRevector and SCCAnnotate translates --- .../transformations/single_column/annotate.py | 71 +++++++------------ .../single_column/tests/test_scc_vector.py | 32 +++++---- loki/transformations/single_column/vector.py | 34 ++++++--- loki/transformations/utilities.py | 5 +- 4 files changed, 75 insertions(+), 67 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index 38099247b..2dcff09de 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -13,7 +13,7 @@ ) from loki.ir import ( 
nodes as ir, FindNodes, Transformer, pragmas_attached, - pragma_regions_attached, is_loki_pragma + pragma_regions_attached, is_loki_pragma, get_pragma_parameters ) from loki.logging import info from loki.tools import as_tuple, flatten @@ -211,29 +211,16 @@ def process_driver(self, routine, targets=None): the transformation call tree. """ - # For the thread block size, find the horizontal size variable that is available in - # the driver - num_threads = None - symbol_map = routine.symbol_map - for size_expr in self.horizontal.size_expressions: - if size_expr in symbol_map: - num_threads = size_expr - break + # Mark all parallel vector loops as `!$acc loop vector` + self.kernel_annotate_vector_loops_openacc(routine) + + # Mark all non-parallel loops as `!$acc loop seq` + self.kernel_annotate_sequential_loops_openacc(routine) with pragmas_attached(routine, ir.Loop, attach_pragma_post=True): driver_loops = find_driver_loops(routine=routine, targets=targets) for loop in driver_loops: - loops = FindNodes(ir.Loop).visit(loop.body) - kernel_loops = [l for l in loops if l.variable == self.horizontal.index] - if kernel_loops: - assert not loop == kernel_loops[0] - self.annotate_driver( - self.directive, loop, kernel_loops, self.block_dim, num_threads - ) - - if self.directive == 'openacc': - # Mark all non-parallel loops as `!$acc loop seq` - self.kernel_annotate_sequential_loops_openacc(routine) + self.annotate_driver(self.directive, loop, self.block_dim) @classmethod def device_alloc_column_locals(cls, routine, column_locals): @@ -257,7 +244,7 @@ def device_alloc_column_locals(cls, routine, column_locals): routine.body.append((ir.Comment(''), pragma_post, ir.Comment(''))) @classmethod - def annotate_driver(cls, directive, driver_loop, kernel_loops, block_dim, num_threads): + def annotate_driver(cls, directive, driver_loop, block_dim): """ Annotate driver block loop with ``'openacc'`` pragmas. 
@@ -273,8 +260,6 @@ def annotate_driver(cls, directive, driver_loop, kernel_loops, block_dim, num_th block_dim : :any:`Dimension` Optional ``Dimension`` object to define the blocking dimension to detect hoisted temporary arrays and excempt them from marking. - num_threads : str - The size expression that determines the number of threads per thread block """ # Mark driver loop as "gang parallel". @@ -289,25 +274,21 @@ def annotate_driver(cls, directive, driver_loop, kernel_loops, block_dim, num_th arrays = [v for v in arrays if not any(d in sizes for d in as_tuple(v.shape))] private_arrays = ', '.join(set(v.name for v in arrays)) private_clause = '' if not private_arrays else f' private({private_arrays})' - vector_length_clause = '' if not num_threads else f' vector_length({num_threads})' - - # Annotate vector loops with OpenACC pragmas - if kernel_loops: - for loop in as_tuple(kernel_loops): - loop._update(pragma=(ir.Pragma(keyword='acc', content='loop vector'),)) - - if driver_loop.pragma is None or (len(driver_loop.pragma) == 1 and - driver_loop.pragma[0].keyword.lower() == "loki" and - driver_loop.pragma[0].content.lower() == "driver-loop"): - p_content = f'parallel loop gang{private_clause}{vector_length_clause}' - driver_loop._update(pragma=(ir.Pragma(keyword='acc', content=p_content),)) - driver_loop._update(pragma_post=(ir.Pragma(keyword='acc', content='end parallel loop'),)) - - # add acc parallel loop gang if the only existing pragma is acc data - elif len(driver_loop.pragma) == 1: - if (driver_loop.pragma[0].keyword == 'acc' and - driver_loop.pragma[0].content.lower().lstrip().startswith('data ')): - p_content = f'parallel loop gang{private_clause}{vector_length_clause}' - driver_loop._update(pragma=(driver_loop.pragma[0], ir.Pragma(keyword='acc', content=p_content))) - driver_loop._update(pragma_post=(ir.Pragma(keyword='acc', content='end parallel loop'), - driver_loop.pragma_post[0])) + + for pragma in as_tuple(driver_loop.pragma): + if 
is_loki_pragma(pragma, starts_with='loop driver'): + # Replace `!$loki loop driver` pragma with OpenACC equivalent + params = get_pragma_parameters(driver_loop.pragma, starts_with='loop driver') + vlength = params.get('vector_length') + vlength_clause = f' vector_length({vlength})' if vlength else '' + + content = f'parallel loop gang{private_clause}{vlength_clause}' + pragma_new = ir.Pragma(keyword='acc', content=content) + pragma_post = ir.Pragma(keyword='acc', content='end parallel loop') + + # Replace existing loki pragma and add post-pragma + loop_pragmas = tuple(p for p in as_tuple(driver_loop.pragma) if p is not pragma) + driver_loop._update( + pragma=loop_pragmas + (pragma_new,), + pragma_post=(pragma_post,) + as_tuple(driver_loop.pragma_post) + ) diff --git a/loki/transformations/single_column/tests/test_scc_vector.py b/loki/transformations/single_column/tests/test_scc_vector.py index 8057a5969..a496e03ce 100644 --- a/loki/transformations/single_column/tests/test_scc_vector.py +++ b/loki/transformations/single_column/tests/test_scc_vector.py @@ -96,7 +96,7 @@ def test_scc_revector_transformation(frontend, horizontal): scc_transform = (SCCDevectorTransformation(horizontal=horizontal),) scc_transform += (SCCRevectorTransformation(horizontal=horizontal),) for transform in scc_transform: - transform.apply(driver, role='driver') + transform.apply(driver, role='driver', targets=('compute_column',)) transform.apply(kernel, role='kernel') # Ensure we have two nested loops in the kernel @@ -126,11 +126,15 @@ def test_scc_revector_transformation(frontend, horizontal): sections = FindNodes(ir.Section).visit(kernel.body) assert all(not s.label for s in sections) - # Ensure driver remains unaffected - driver_loops = FindNodes(ir.Loop).visit(driver.body) - assert len(driver_loops) == 1 - assert driver_loops[0].variable == 'b' - assert driver_loops[0].bounds == '1:nb' + # Ensure driver remains unaffected and is marked + with pragmas_attached(driver, node_type=ir.Loop): + 
driver_loops = FindNodes(ir.Loop).visit(driver.body) + assert len(driver_loops) == 1 + assert driver_loops[0].variable == 'b' + assert driver_loops[0].bounds == '1:nb' + assert driver_loops[0].pragma and len(driver_loops[0].pragma) == 1 + assert is_loki_pragma(driver_loops[0].pragma[0], starts_with='loop driver') + assert 'vector_length(nlon)' in driver_loops[0].pragma[0].content kernel_calls = FindNodes(ir.CallStatement).visit(driver_loops[0]) assert len(kernel_calls) == 1 @@ -209,7 +213,7 @@ def test_scc_revector_transformation_aliased_bounds(frontend, horizontal_bounds_ scc_transform = (SCCDevectorTransformation(horizontal=horizontal_bounds_aliases),) scc_transform += (SCCRevectorTransformation(horizontal=horizontal_bounds_aliases),) for transform in scc_transform: - transform.apply(driver, role='driver') + transform.apply(driver, role='driver', targets=('compute_column',)) transform.apply(kernel, role='kernel') # Ensure we have two nested loops in the kernel @@ -239,11 +243,15 @@ def test_scc_revector_transformation_aliased_bounds(frontend, horizontal_bounds_ sections = FindNodes(ir.Section).visit(kernel.body) assert all(not s.label for s in sections) - # Ensure driver remains unaffected - driver_loops = FindNodes(ir.Loop).visit(driver.body) - assert len(driver_loops) == 1 - assert driver_loops[0].variable == 'b' - assert driver_loops[0].bounds == '1:nb' + # Ensure driver remains unaffected and is marked + with pragmas_attached(driver, node_type=ir.Loop): + driver_loops = FindNodes(ir.Loop).visit(driver.body) + assert len(driver_loops) == 1 + assert driver_loops[0].variable == 'b' + assert driver_loops[0].bounds == '1:nb' + assert driver_loops[0].pragma and len(driver_loops[0].pragma) == 1 + assert is_loki_pragma(driver_loops[0].pragma[0], starts_with='loop driver') + assert 'vector_length(nlon)' in driver_loops[0].pragma[0].content kernel_calls = FindNodes(ir.CallStatement).visit(driver_loops[0]) assert len(kernel_calls) == 1 diff --git 
a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index 16f4b19c5..9b175b893 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -343,6 +343,29 @@ def mark_seq_loops(self, section): if loop.variable != self.horizontal.index: loop._update(pragma=(ir.Pragma(keyword='loki', content='loop seq'),)) + def mark_driver_loop(self, routine, loop): + """ + Add ``!$loki loop driver`` pragmas to outer block loops and + add ``vector-length(size)`` clause for later annotations. + + This method assumes that pragmas have been attached via + :any:`pragmas_attached`. + """ + # Find a horizontal size variable to mark vector_length + symbol_map = routine.symbol_map + sizes = tuple( + symbol_map.get(size) for size in self.horizontal.size_expressions + if size in symbol_map + ) + vector_length = f' vector_length({sizes[0]})' if sizes else '' + + # Replace existing `!$loki loop driver markers, but leave all others + pragma = ir.Pragma(keyword='loki', content=f'loop driver{vector_length}') + loop_pragmas = tuple( + p for p in as_tuple(loop.pragma) if not is_loki_pragma(p, starts_with='driver-loop') + ) + loop._update(pragma=loop_pragmas + (pragma,)) + def transform_subroutine(self, routine, **kwargs): """ Wrap vector-parallel sections in vector :any:`Loop` objects. 
@@ -373,7 +396,7 @@ def transform_subroutine(self, routine, **kwargs): self.mark_seq_loops(routine.body) if role == 'driver': - with pragmas_attached(routine, ir.Loop, attach_pragma_post=True): + with pragmas_attached(routine, ir.Loop): driver_loops = find_driver_loops(routine=routine, targets=targets) for loop in driver_loops: @@ -383,13 +406,8 @@ def transform_subroutine(self, routine, **kwargs): # Mark sequential loops inside vector sections self.mark_seq_loops(loop.body) - if self.remove_vector_section: - # Remove the vector section wrappers - # These have been inserted by SCCDevectorTransformation - section_mapper = {s: s.body for s in FindNodes(ir.Section).visit(routine.body) - if s.label == 'vector_section'} - if section_mapper: - routine.body = Transformer(section_mapper).visit(routine.body) + # Mark outer driver loops + self.mark_driver_loop(routine, loop) class SCCDemoteTransformation(Transformation): diff --git a/loki/transformations/utilities.py b/loki/transformations/utilities.py index 9fb7ad07b..a5d8051c5 100644 --- a/loki/transformations/utilities.py +++ b/loki/transformations/utilities.py @@ -19,7 +19,7 @@ ) from loki.ir import ( nodes as ir, Import, TypeDef, VariableDeclaration, - StatementFunction, Transformer, FindNodes + StatementFunction, Transformer, FindNodes, is_loki_pragma ) from loki.module import Module from loki.subroutine import Subroutine @@ -585,7 +585,8 @@ def is_driver_loop(loop, targets): """ if loop.pragma: for pragma in loop.pragma: - if pragma.keyword.lower() == "loki" and pragma.content.lower() == "driver-loop": + if is_loki_pragma(pragma, starts_with='driver-loop') or \ + is_loki_pragma(pragma, starts_with='loop driver'): return True for call in FindNodes(ir.CallStatement).visit(loop.body): if call.name in targets: From f2eedf6468003efddf93acc6b884e37561a4d7bd Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Fri, 9 Aug 2024 04:43:00 +0000 Subject: [PATCH 07/12] SCC: Move vector-reduction region processing to SCCRevector --- 
.../transformations/single_column/annotate.py | 35 ++++++------------ loki/transformations/single_column/vector.py | 37 ++++++++++++++++++- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index 2dcff09de..7cbbb47fd 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -5,15 +5,13 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. -import re - from loki.batch import Transformation from loki.expression import ( symbols as sym, FindVariables, is_dimension_constant ) from loki.ir import ( nodes as ir, FindNodes, Transformer, pragmas_attached, - pragma_regions_attached, is_loki_pragma, get_pragma_parameters + is_loki_pragma, get_pragma_parameters ) from loki.logging import info from loki.tools import as_tuple, flatten @@ -76,28 +74,19 @@ def kernel_annotate_vector_loops_openacc(cls, routine): f'{[a.name for a in private_arrays]}' ) - mapper = {} - with pragma_regions_attached(routine): - for region in FindNodes(ir.PragmaRegion).visit(routine.body): - if is_loki_pragma(region.pragma, starts_with='vector-reduction'): - if (reduction_clause := re.search(r'reduction\([\w:0-9 \t]+\)', region.pragma.content)): - - loops = FindNodes(ir.Loop).visit(region) - assert len(loops) == 1 - pragma = ir.Pragma(keyword='acc', content=f'loop vector {reduction_clause[0]}') - # Update loop and region in place to remove marker pragmas - loops[0]._update(pragma=(pragma,)) - region._update(pragma=None, pragma_post=None) - with pragmas_attached(routine, ir.Loop): for loop in FindNodes(ir.Loop).visit(routine.body): - if is_loki_pragma(loop.pragma, starts_with='loop vector'): - # Construct pragma and wrap entire body in vector loop - private_arrs = ', '.join(v.name for v in private_arrays) - pragma = () - private_clause = '' if not private_arrays else f' 
private({private_arrs})' - pragma = ir.Pragma(keyword='acc', content=f'loop vector{private_clause}') - loop._update(pragma=(pragma,)) + for pragma in as_tuple(loop.pragma): + if is_loki_pragma(pragma, starts_with='loop vector reduction'): + # Turn reduction pragmas into `!$acc` equivalent + pragma._update(keyword='acc') + continue + + if is_loki_pragma(pragma, starts_with='loop vector'): + # Turn general vector pragmas into `!$acc` and add private clause + private_arrs = ', '.join(v.name for v in private_arrays) + private_clause = '' if not private_arrays else f' private({private_arrs})' + pragma._update(keyword='acc', content=f'loop vector{private_clause}') @classmethod def kernel_annotate_sequential_loops_openacc(cls, routine): diff --git a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index 9b175b893..683b8025e 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -5,6 +5,8 @@ # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. +import re + from more_itertools import split_at from loki.analyse import dataflow_analysis_attached @@ -14,7 +16,7 @@ ) from loki.ir import ( nodes as ir, FindNodes, FindScopes, Transformer, - NestedTransformer, is_loki_pragma, pragmas_attached + NestedTransformer, is_loki_pragma, pragmas_attached, pragma_regions_attached ) from loki.tools import as_tuple, flatten from loki.types import BasicType @@ -320,6 +322,31 @@ def revector_section(self, routine, section): } return Transformer(mapper).visit(section) + def mark_vector_reductions(self, routine, section): + """ + Mark vector-reduction loops in marked vector-reduction + regions. + + If a region explicitly marked with + ``!$loki vector-reduction()``/ + ``!$loki end vector-reduction`` is encountered, we replace + existing ``!$loki loop vector`` loop pragmas and add the + reduction keyword and clause. 
These will be turned into + OpenACC equivalents by :any:`SCCAnnotate`. + """ + with pragma_regions_attached(routine): + for region in FindNodes(ir.PragmaRegion).visit(section): + if is_loki_pragma(region.pragma, starts_with='vector-reduction'): + if (reduction_clause := re.search(r'reduction\([\w:0-9 \t]+\)', region.pragma.content)): + + loops = FindNodes(ir.Loop).visit(region) + assert len(loops) == 1 + pragma = ir.Pragma(keyword='loki', content=f'loop vector {reduction_clause[0]}') + # Update loop and region in place to remove marker pragmas + loops[0]._update(pragma=(pragma,)) + region._update(pragma=None, pragma_post=None) + + def mark_seq_loops(self, section): """ Mark interior sequential loops in a thread-parallel section @@ -391,8 +418,11 @@ def transform_subroutine(self, routine, **kwargs): # Revector all marked vector sections within the kernel body routine.body = self.revector_section(routine, routine.body) - # Mark sequential loops inside vector sections with pragmas_attached(routine, ir.Loop): + # Check for explicitly labelled vector-reduction regions + self.mark_vector_reductions(routine, routine.body) + + # Mark sequential loops inside vector sections self.mark_seq_loops(routine.body) if role == 'driver': @@ -403,6 +433,9 @@ def transform_subroutine(self, routine, **kwargs): # Revector all marked sections within the driver loop body loop._update(body=self.revector_section(routine, loop.body)) + # Check for explicitly labelled vector-reduction regions + self.mark_vector_reductions(routine, loop.body) + # Mark sequential loops inside vector sections self.mark_seq_loops(loop.body) From 6308e2232978e0e59131c2c7df197cfcf1eb5179 Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Fri, 9 Aug 2024 10:45:43 +0000 Subject: [PATCH 08/12] SCC: Let SCCRevector mark routine and SCCAnnotate translates This also renames and refactors the `check_routine_pragmas` utility, which now only needs to check for genuine `!$loki routine seq` annotations. 
--- .../block_index_transformations.py | 6 ++-- .../transformations/single_column/annotate.py | 21 ++++++----- loki/transformations/single_column/base.py | 4 +-- .../single_column/tests/test_scc.py | 14 +++++--- loki/transformations/single_column/vector.py | 11 ++++-- loki/transformations/tests/test_utilities.py | 18 +++++----- loki/transformations/utilities.py | 36 ++++--------------- 7 files changed, 50 insertions(+), 60 deletions(-) diff --git a/loki/transformations/block_index_transformations.py b/loki/transformations/block_index_transformations.py index c77f550a8..65e017a2f 100644 --- a/loki/transformations/block_index_transformations.py +++ b/loki/transformations/block_index_transformations.py @@ -21,7 +21,7 @@ from loki.transformations.sanitise import resolve_associates from loki.transformations.utilities import ( recursive_expression_map_update, get_integer_variable, - get_loop_bounds, check_routine_pragmas + get_loop_bounds, check_routine_sequential ) from loki.transformations.single_column.base import SCCBaseTransformation @@ -246,8 +246,8 @@ def process_kernel(self, routine, item, successors, targets, exclude_arrays): v_index = get_integer_variable(routine, name=self.horizontal.index) SCCBaseTransformation.resolve_masked_stmts(routine, loop_variable=v_index) - # Bail if routine is marked as sequential or routine has already been processed - if check_routine_pragmas(routine, directive=None): + # Bail if routine is marked as sequential + if check_routine_sequential(routine): return bounds = get_loop_bounds(routine, self.horizontal) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index 7cbbb47fd..46b6fd468 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -18,7 +18,7 @@ from loki.types import DerivedType from loki.transformations.utilities import ( - find_driver_loops, get_local_arrays, check_routine_pragmas + find_driver_loops, 
get_local_arrays ) @@ -116,7 +116,8 @@ def kernel_annotate_sequential_loops_openacc(cls, routine): @classmethod def kernel_annotate_subroutine_present_openacc(cls, routine): """ - Insert ``!$acc data present`` annotations around the body of a subroutine. + Insert ``!$acc routine seq/vector`` directives and wrap + subroutine body in ``!$acc data present`` directives. Parameters ---------- @@ -124,6 +125,11 @@ def kernel_annotate_subroutine_present_openacc(cls, routine): The subroutine to which annotations will be added """ + # Update `!$loki routine seq/vector` pragmas with `!$acc` + for pragma in FindNodes(ir.Pragma).visit(routine.ir): + if is_loki_pragma(pragma, starts_with='routine'): + pragma._update(keyword='acc') + # Get the names of all array and derived type arguments args = [a for a in routine.arguments if isinstance(a, sym.Array)] args += [a for a in routine.arguments if isinstance(a.type.dtype, DerivedType)] @@ -146,9 +152,6 @@ def insert_annotations(cls, routine, horizontal): # to ensure device-resident data is used for array and struct arguments. cls.kernel_annotate_subroutine_present_openacc(routine) - # Mark routine as `!$acc routine vector` to make it device-callable - routine.spec.append(ir.Pragma(keyword='acc', content='routine vector')) - def transform_subroutine(self, routine, **kwargs): """ Apply SCCAnnotate utilities to a :any:`Subroutine`. @@ -180,9 +183,11 @@ def process_kernel(self, routine): Subroutine to apply this transformation to. 
""" - # Bail if routine is marked as sequential - if check_routine_pragmas(routine, self.directive): - return + # Bail if this routine has been processed before + for p in FindNodes(ir.Pragma).visit(routine.ir): + # Check if `!$acc routine` has already been added + if p.keyword.lower() == 'acc' and 'routine' in p.content.lower(): + return if self.directive == 'openacc': self.insert_annotations(routine, self.horizontal) diff --git a/loki/transformations/single_column/base.py b/loki/transformations/single_column/base.py index be730c418..b69f14f1d 100644 --- a/loki/transformations/single_column/base.py +++ b/loki/transformations/single_column/base.py @@ -14,7 +14,7 @@ from loki.transformations.sanitise import resolve_associates from loki.transformations.utilities import ( - get_integer_variable, get_loop_bounds, check_routine_pragmas + get_integer_variable, get_loop_bounds, check_routine_sequential ) @@ -164,7 +164,7 @@ def process_kernel(self, routine): """ # Bail if routine is marked as sequential or routine has already been processed - if check_routine_pragmas(routine, self.directive): + if check_routine_sequential(routine): return # Bail if routine is elemental diff --git a/loki/transformations/single_column/tests/test_scc.py b/loki/transformations/single_column/tests/test_scc.py index ffb0fe44b..4f7a935d2 100644 --- a/loki/transformations/single_column/tests/test_scc.py +++ b/loki/transformations/single_column/tests/test_scc.py @@ -753,9 +753,10 @@ def test_scc_multiple_acc_pragmas(frontend, horizontal, blocking): @pytest.mark.parametrize('frontend', available_frontends()) -def test_scc_base_routine_seq_pragma(frontend, horizontal): +def test_scc_annotate_routine_seq_pragma(frontend, horizontal, blocking): """ - Test that `!$loki routine seq` pragmas are replaced correctly by `!$acc routine seq` pragmas. + Test that `!$loki routine seq` pragmas are replaced correctly by + `!$acc routine seq` pragmas. 
""" fcode = """ @@ -781,8 +782,12 @@ def test_scc_base_routine_seq_pragma(frontend, horizontal): assert pragmas[0].keyword == 'loki' assert pragmas[0].content == 'routine seq' - transformation = SCCBaseTransformation(horizontal=horizontal, directive='openacc') - transformation.transform_subroutine(routine, role='kernel', targets=['some_kernel',]) + transformation = SCCAnnotateTransformation( + horizontal=horizontal, directive='openacc', block_dim=blocking + ) + transformation.transform_subroutine( + routine, role='kernel', targets=['some_kernel',] + ) pragmas = FindNodes(Pragma).visit(routine.spec) assert len(pragmas) == 1 @@ -790,7 +795,6 @@ def test_scc_base_routine_seq_pragma(frontend, horizontal): assert pragmas[0].content == 'routine seq' - @pytest.mark.parametrize('frontend', available_frontends()) def test_scc_vector_reduction(frontend, horizontal, blocking): """ diff --git a/loki/transformations/single_column/vector.py b/loki/transformations/single_column/vector.py index 683b8025e..7d200d960 100644 --- a/loki/transformations/single_column/vector.py +++ b/loki/transformations/single_column/vector.py @@ -24,7 +24,7 @@ from loki.transformations.array_indexing import demote_variables from loki.transformations.utilities import ( get_integer_variable, get_loop_bounds, find_driver_loops, - get_local_arrays, check_routine_pragmas + get_local_arrays, check_routine_sequential ) @@ -99,7 +99,7 @@ def extract_vector_sections(cls, section, horizontal): # check if calls have been enriched if not call.routine is BasicType.DEFERRED: # check if called routine is marked as sequential - if check_routine_pragmas(routine=call.routine, directive=None): + if check_routine_sequential(routine=call.routine): continue if call in section: @@ -415,6 +415,10 @@ def transform_subroutine(self, routine, **kwargs): targets = kwargs.get('targets', ()) if role == 'kernel': + # Skip if kernel is marked as `!$loki routine seq` + if check_routine_sequential(routine): + return + # Revector all 
marked vector sections within the kernel body routine.body = self.revector_section(routine, routine.body) @@ -425,6 +429,9 @@ def transform_subroutine(self, routine, **kwargs): # Mark sequential loops inside vector sections self.mark_seq_loops(routine.body) + # Mark subroutine as vector parallel for later annotation + routine.spec.append(ir.Pragma(keyword='loki', content='routine vector')) + if role == 'driver': with pragmas_attached(routine, ir.Loop): driver_loops = find_driver_loops(routine=routine, targets=targets) diff --git a/loki/transformations/tests/test_utilities.py b/loki/transformations/tests/test_utilities.py index f9fb65697..53a91ca74 100644 --- a/loki/transformations/tests/test_utilities.py +++ b/loki/transformations/tests/test_utilities.py @@ -19,7 +19,7 @@ single_variable_declaration, recursive_expression_map_update, convert_to_lower_case, replace_intrinsics, rename_variables, get_integer_variable, get_loop_bounds, is_driver_loop, - find_driver_loops, get_local_arrays, check_routine_pragmas + find_driver_loops, get_local_arrays, check_routine_sequential ) @@ -520,11 +520,11 @@ def test_transform_utilites_get_local_arrays(frontend, tmp_path): @pytest.mark.parametrize('frontend', available_frontends()) -def test_transform_utilites_check_routine_pragmas(frontend, tmp_path): - """ Test :any:`check_routine_pragmas` utility. """ +def test_transform_utilites_check_routine_sequential(frontend, tmp_path): + """ Test :any:`check_routine_sequential` utility. 
""" fcode = """ -module test_check_routine_pragmas_mod +module test_check_routine_sequential_mod implicit none contains @@ -546,12 +546,10 @@ def test_transform_utilites_check_routine_pragmas(frontend, tmp_path): i = i + 1 end subroutine test_acc_vec -end module test_check_routine_pragmas_mod +end module test_check_routine_sequential_mod """ module = Module.from_source(fcode, frontend=frontend, xmods=[tmp_path]) - # TODO: This utility needs some serious clean-up, so we're just testing - # the bare basics here and promise to do better next time ;) - assert check_routine_pragmas(module['test_acc_seq'], directive=None) - assert check_routine_pragmas(module['test_loki_seq'], directive=None) - assert check_routine_pragmas(module['test_acc_vec'], directive='openacc') + assert not check_routine_sequential(module['test_acc_seq']) + assert check_routine_sequential(module['test_loki_seq']) + assert not check_routine_sequential(module['test_acc_vec']) diff --git a/loki/transformations/utilities.py b/loki/transformations/utilities.py index a5d8051c5..76ab9eafd 100644 --- a/loki/transformations/utilities.py +++ b/loki/transformations/utilities.py @@ -32,7 +32,7 @@ 'sanitise_imports', 'replace_selected_kind', 'single_variable_declaration', 'recursive_expression_map_update', 'get_integer_variable', 'get_loop_bounds', 'find_driver_loops', - 'get_local_arrays', 'check_routine_pragmas' + 'get_local_arrays', 'check_routine_sequential' ] @@ -652,41 +652,17 @@ def get_local_arrays(routine, section, unique=True): return arrays -def check_routine_pragmas(routine, directive): +def check_routine_sequential(routine): """ - Check if routine is marked as sequential or has already been processed. + Check if routine is marked as "sequential". Parameters ---------- routine : :any:`Subroutine` Subroutine to perform checks on. - directive: string or None - Directives flavour to use for parallelism annotations; either - ``'openacc'`` or ``None``. 
- """ - - pragmas = FindNodes(ir.Pragma).visit(routine.ir) - routine_pragmas = [p for p in pragmas if p.keyword.lower() in ['loki', 'acc']] - routine_pragmas = [p for p in routine_pragmas if 'routine' in p.content.lower()] - - seq_pragmas = [r for r in routine_pragmas if 'seq' in r.content.lower()] - if seq_pragmas: - loki_seq_pragmas = [r for r in routine_pragmas if 'loki' == r.keyword.lower()] - if loki_seq_pragmas: - if directive == 'openacc': - # Mark routine as acc seq - mapper = {seq_pragmas[0]: None} - routine.spec = Transformer(mapper).visit(routine.spec) - routine.body = Transformer(mapper).visit(routine.body) - - # Append the acc pragma to routine.spec, regardless of where the corresponding - # loki pragma is found - routine.spec.append(ir.Pragma(keyword='acc', content='routine seq')) - return True - - vec_pragmas = [r for r in routine_pragmas if 'vector' in r.content.lower()] - if vec_pragmas: - if directive == 'openacc': + """ + for pragma in FindNodes(ir.Pragma).visit(routine.ir): + if is_loki_pragma(pragma, starts_with='routine seq'): return True return False From ad34d09129736f36345fb5b24d87766e3be3775a Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Fri, 9 Aug 2024 11:33:44 +0000 Subject: [PATCH 09/12] SCC: Tidy up SCCAnnotate; rename methods and remove horizontal We also change the static classmethods to regular methods to provide access to the `directive` property in the follow-up.
--- .../transformations/single_column/annotate.py | 143 +++++++----------- .../single_column/tests/test_scc.py | 15 +- .../single_column/tests/test_scc_hoist.py | 4 +- 3 files changed, 60 insertions(+), 102 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index 46b6fd468..84cfe09fa 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -10,8 +10,8 @@ symbols as sym, FindVariables, is_dimension_constant ) from loki.ir import ( - nodes as ir, FindNodes, Transformer, pragmas_attached, - is_loki_pragma, get_pragma_parameters + nodes as ir, FindNodes, pragmas_attached, is_loki_pragma, + get_pragma_parameters ) from loki.logging import info from loki.tools import as_tuple, flatten @@ -32,9 +32,6 @@ class SCCAnnotateTransformation(Transformation): Parameters ---------- - horizontal : :any:`Dimension` - :any:`Dimension` object describing the variable conventions used in code - to define the horizontal data dimension and iteration space. block_dim : :any:`Dimension` Optional ``Dimension`` object to define the blocking dimension to use for hoisted column arrays if hoisting is enabled. @@ -43,16 +40,14 @@ class SCCAnnotateTransformation(Transformation): ``'openacc'`` or ``None``. """ - def __init__(self, horizontal, directive, block_dim): - self.horizontal = horizontal + def __init__(self, directive, block_dim): self.directive = directive self.block_dim = block_dim - @classmethod - def kernel_annotate_vector_loops_openacc(cls, routine): + def annotate_vector_loops(self, routine): """ - Insert ``!$acc loop vector`` annotations around horizontal vector - loops, including the necessary private variable declarations. + Insert ``!$acc loop vector`` for previously marked loops, + including addition of the necessary private variable declarations. 
Parameters ---------- @@ -88,8 +83,7 @@ def kernel_annotate_vector_loops_openacc(cls, routine): private_clause = '' if not private_arrays else f' private({private_arrs})' pragma._update(keyword='acc', content=f'loop vector{private_clause}') - @classmethod - def kernel_annotate_sequential_loops_openacc(cls, routine): + def annotate_sequential_loops(self, routine): """ Insert ``!$acc loop seq`` annotations for all loops previously marked with ``!$loki loop seq``. @@ -113,8 +107,7 @@ def kernel_annotate_sequential_loops_openacc(cls, routine): if any('loop vector' in pragma.content for pragma in loop_pragmas): info(f'[Loki-SCC::Annotate] Detected vector loop in sequential loop in {routine.name}') - @classmethod - def kernel_annotate_subroutine_present_openacc(cls, routine): + def annotate_kernel_routine(self, routine): """ Insert ``!$acc routine seq/vector`` directives and wrap subroutine body in ``!$acc data present`` directives. @@ -139,22 +132,20 @@ def kernel_annotate_subroutine_present_openacc(cls, routine): # Add comment to prevent false-attachment in case it is preceded by an "END DO" statement routine.body.append((ir.Comment(text=''), ir.Pragma(keyword='acc', content='end data'))) - @classmethod - def insert_annotations(cls, routine, horizontal): - - # Mark all parallel vector loops as `!$acc loop vector` - cls.kernel_annotate_vector_loops_openacc(routine) - - # Mark all non-parallel loops as `!$acc loop seq` - cls.kernel_annotate_sequential_loops_openacc(routine) - - # Wrap the routine body in `!$acc data present` markers - # to ensure device-resident data is used for array and struct arguments. - cls.kernel_annotate_subroutine_present_openacc(routine) - def transform_subroutine(self, routine, **kwargs): """ - Apply SCCAnnotate utilities to a :any:`Subroutine`. + Apply OpenACC annotations according to ``!$loki`` placeholder + directives. 
+ + This routine effectively converts neutral ``!$loki loop`` and + ``!$loki routine`` annotations into the corresponding + ``!$acc`` equivalent directives. It also adds ``!$acc data + present`` clauses around kernel routine bodies and adds + ``private`` clauses to loop annotations. + + If the ``directive`` provided is not ``openacc``, no change is + applied. In the future, we aim to support ``OpenMP`` + equivalent directives here. Parameters ---------- @@ -167,54 +158,39 @@ def transform_subroutine(self, routine, **kwargs): role = kwargs['role'] targets = as_tuple(kwargs.get('targets')) + if not self.directive == 'openacc': + return + if role == 'kernel': - self.process_kernel(routine) - if role == 'driver': - self.process_driver(routine, targets=targets) + # Bail if this routine has been processed before + for p in FindNodes(ir.Pragma).visit(routine.ir): + # Check if `!$acc routine` has already been added + if p.keyword.lower() == 'acc' and 'routine' in p.content.lower(): + return - def process_kernel(self, routine): - """ - Applies the SCCAnnotate utilities to a "kernel". This consists of inserting the relevant - ``'openacc'`` annotations at the :any:`Loop` and :any:`Subroutine` level. + # Mark all parallel vector loops as `!$acc loop vector` + self.annotate_vector_loops(routine) - Parameters - ---------- - routine : :any:`Subroutine` - Subroutine to apply this transformation to. - """ + # Mark all non-parallel loops as `!$acc loop seq` + self.annotate_sequential_loops(routine) - # Bail if this routine has been processed before - for p in FindNodes(ir.Pragma).visit(routine.ir): - # Check if `!$acc routine` has already been added - if p.keyword.lower() == 'acc' and 'routine' in p.content.lower(): - return + # Wrap the routine body in `!$acc data present` markers to + # ensure all arguments are device-resident. 
+ self.annotate_kernel_routine(routine) - if self.directive == 'openacc': - self.insert_annotations(routine, self.horizontal) - def process_driver(self, routine, targets=None): - """ - Apply the relevant ``'openacc'`` annotations to the driver loop. - - Parameters - ---------- - routine : :any:`Subroutine` - Subroutine to apply this transformation to. - targets : list or string - List of subroutines that are to be considered as part of - the transformation call tree. - """ + if role == 'driver': + # Mark all parallel vector loops as `!$acc loop vector` + self.annotate_vector_loops(routine) - # Mark all parallel vector loops as `!$acc loop vector` - self.kernel_annotate_vector_loops_openacc(routine) + # Mark all non-parallel loops as `!$acc loop seq` + self.annotate_sequential_loops(routine) - # Mark all non-parallel loops as `!$acc loop seq` - self.kernel_annotate_sequential_loops_openacc(routine) + with pragmas_attached(routine, ir.Loop, attach_pragma_post=True): + driver_loops = find_driver_loops(routine=routine, targets=targets) + for loop in driver_loops: + self.annotate_driver_loop(loop) - with pragmas_attached(routine, ir.Loop, attach_pragma_post=True): - driver_loops = find_driver_loops(routine=routine, targets=targets) - for loop in driver_loops: - self.annotate_driver(self.directive, loop, self.block_dim) @classmethod def device_alloc_column_locals(cls, routine, column_locals): @@ -237,42 +213,33 @@ def device_alloc_column_locals(cls, routine, column_locals): routine.body.prepend((ir.Comment(''), pragma, ir.Comment(''))) routine.body.append((ir.Comment(''), pragma_post, ir.Comment(''))) - @classmethod - def annotate_driver(cls, directive, driver_loop, block_dim): + def annotate_driver_loop(self, loop): """ Annotate driver block loop with ``'openacc'`` pragmas. Parameters ---------- - directive : string or None - Directives flavour to use for parallelism annotations; either - ``'openacc'`` or ``None``. 
- driver_loop : :any:`Loop` - Driver ``Loop`` to wrap in ``'opencc'`` pragmas. - kernel_loops : list of :any:`Loop` - Vector ``Loop`` to wrap in ``'opencc'`` pragmas if hoisting is enabled. - block_dim : :any:`Dimension` - Optional ``Dimension`` object to define the blocking dimension - to detect hoisted temporary arrays and excempt them from marking. + loop : :any:`Loop` + Driver :any:`Loop` to wrap in ``'opencc'`` pragmas. """ # Mark driver loop as "gang parallel". - if directive == 'openacc': - arrays = FindVariables(unique=True).visit(driver_loop) + if self.directive == 'openacc': + arrays = FindVariables(unique=True).visit(loop) arrays = [v for v in arrays if isinstance(v, sym.Array)] arrays = [v for v in arrays if not v.type.intent] arrays = [v for v in arrays if not v.type.pointer] # Filter out arrays that are explicitly allocated with block dimension - sizes = block_dim.size_expressions + sizes = self.block_dim.size_expressions arrays = [v for v in arrays if not any(d in sizes for d in as_tuple(v.shape))] private_arrays = ', '.join(set(v.name for v in arrays)) private_clause = '' if not private_arrays else f' private({private_arrays})' - for pragma in as_tuple(driver_loop.pragma): + for pragma in as_tuple(loop.pragma): if is_loki_pragma(pragma, starts_with='loop driver'): # Replace `!$loki loop driver` pragma with OpenACC equivalent - params = get_pragma_parameters(driver_loop.pragma, starts_with='loop driver') + params = get_pragma_parameters(loop.pragma, starts_with='loop driver') vlength = params.get('vector_length') vlength_clause = f' vector_length({vlength})' if vlength else '' @@ -281,8 +248,8 @@ def annotate_driver(cls, directive, driver_loop, block_dim): pragma_post = ir.Pragma(keyword='acc', content='end parallel loop') # Replace existing loki pragma and add post-pragma - loop_pragmas = tuple(p for p in as_tuple(driver_loop.pragma) if p is not pragma) - driver_loop._update( + loop_pragmas = tuple(p for p in as_tuple(loop.pragma) if p is not 
pragma) + loop._update( pragma=loop_pragmas + (pragma_new,), - pragma_post=(pragma_post,) + as_tuple(driver_loop.pragma_post) + pragma_post=(pragma_post,) + as_tuple(loop.pragma_post) ) diff --git a/loki/transformations/single_column/tests/test_scc.py b/loki/transformations/single_column/tests/test_scc.py index 4f7a935d2..99b4eb5b2 100644 --- a/loki/transformations/single_column/tests/test_scc.py +++ b/loki/transformations/single_column/tests/test_scc.py @@ -298,8 +298,7 @@ def test_scc_annotate_openacc(frontend, horizontal, blocking): scc_transform = (SCCDevectorTransformation(horizontal=horizontal),) scc_transform += (SCCDemoteTransformation(horizontal=horizontal),) scc_transform += (SCCRevectorTransformation(horizontal=horizontal),) - scc_transform += (SCCAnnotateTransformation(horizontal=horizontal, - directive='openacc', block_dim=blocking),) + scc_transform += (SCCAnnotateTransformation(directive='openacc', block_dim=blocking),) for transform in scc_transform: transform.apply(driver, role='driver', targets=['compute_column']) transform.apply(kernel, role='kernel') @@ -407,9 +406,7 @@ def test_scc_nested(frontend, horizontal, blocking): scc_pipeline.apply(inner_kernel, role='kernel') # Apply annotate twice to test bailing out mechanism - scc_annotate = SCCAnnotateTransformation( - horizontal=horizontal, directive='openacc', block_dim=blocking - ) + scc_annotate = SCCAnnotateTransformation(directive='openacc', block_dim=blocking) scc_annotate.apply(driver, role='driver', targets=['compute_column']) scc_annotate.apply(outer_kernel, role='kernel', targets=['compute_q']) scc_annotate.apply(inner_kernel, role='kernel') @@ -782,12 +779,8 @@ def test_scc_annotate_routine_seq_pragma(frontend, horizontal, blocking): assert pragmas[0].keyword == 'loki' assert pragmas[0].content == 'routine seq' - transformation = SCCAnnotateTransformation( - horizontal=horizontal, directive='openacc', block_dim=blocking - ) - transformation.transform_subroutine( - routine, 
role='kernel', targets=['some_kernel',] - ) + transformation = SCCAnnotateTransformation(directive='openacc', block_dim=blocking) + transformation.transform_subroutine(routine, role='kernel', targets=['some_kernel',]) pragmas = FindNodes(Pragma).visit(routine.spec) assert len(pragmas) == 1 diff --git a/loki/transformations/single_column/tests/test_scc_hoist.py b/loki/transformations/single_column/tests/test_scc_hoist.py index 533e89bb6..344c980a9 100644 --- a/loki/transformations/single_column/tests/test_scc_hoist.py +++ b/loki/transformations/single_column/tests/test_scc_hoist.py @@ -261,9 +261,7 @@ def test_scc_hoist_multiple_kernels_loops(tmp_path, frontend, trim_vector_sectio transformation += (SCCDevectorTransformation(horizontal=horizontal, trim_vector_sections=trim_vector_sections),) transformation += (SCCDemoteTransformation(horizontal=horizontal),) transformation += (SCCRevectorTransformation(horizontal=horizontal),) - transformation += (SCCAnnotateTransformation( - horizontal=horizontal, directive='openacc', block_dim=blocking, - ),) + transformation += (SCCAnnotateTransformation(directive='openacc', block_dim=blocking),) for transform in transformation: scheduler.process(transformation=transform) From d3c70fb500d4bf4707d0b6e958d99d34b3041151 Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Mon, 9 Sep 2024 09:37:19 +0000 Subject: [PATCH 10/12] SingleColumn: Ensure that routine pragmas are always in the spec There's a subtle bug, where they can be attributed to the body, and thus need moving explicitly. This was done by the previous utility, but never checked explicitly - so now we do test it!
--- loki/transformations/single_column/annotate.py | 11 +++++++++-- loki/transformations/single_column/tests/test_scc.py | 5 +++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index 84cfe09fa..d2d641576 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -11,7 +11,7 @@ ) from loki.ir import ( nodes as ir, FindNodes, pragmas_attached, is_loki_pragma, - get_pragma_parameters + get_pragma_parameters, Transformer ) from loki.logging import info from loki.tools import as_tuple, flatten @@ -119,9 +119,16 @@ def annotate_kernel_routine(self, routine): """ # Update `!$loki routine seq/vector` pragmas with `!$acc` + pragma_map = {} for pragma in FindNodes(ir.Pragma).visit(routine.ir): if is_loki_pragma(pragma, starts_with='routine'): - pragma._update(keyword='acc') + # We have to re-insert the pragma here, in case it was + # falsely attributed to the body! + routine.spec.append(pragma.clone(keyword='acc')) + pragma_map[pragma] = None + pragma_transformer = Transformer(pragma_map) + routine.spec = pragma_transformer.visit(routine.spec) + routine.body = pragma_transformer.visit(routine.body) # Get the names of all array and derived type arguments args = [a for a in routine.arguments if isinstance(a, sym.Array)] diff --git a/loki/transformations/single_column/tests/test_scc.py b/loki/transformations/single_column/tests/test_scc.py index 99b4eb5b2..474bef9c2 100644 --- a/loki/transformations/single_column/tests/test_scc.py +++ b/loki/transformations/single_column/tests/test_scc.py @@ -762,8 +762,8 @@ def test_scc_annotate_routine_seq_pragma(frontend, horizontal, blocking): integer, intent(in) :: nang real, dimension(nang), intent(inout) :: work -!$loki routine seq integer :: k +!$loki routine seq do k=1,nang work(k) = 1. 
@@ -774,7 +774,7 @@ def test_scc_annotate_routine_seq_pragma(frontend, horizontal, blocking): routine = Subroutine.from_source(fcode, frontend=frontend) - pragmas = FindNodes(Pragma).visit(routine.spec) + pragmas = FindNodes(Pragma).visit(routine.ir) assert len(pragmas) == 1 assert pragmas[0].keyword == 'loki' assert pragmas[0].content == 'routine seq' @@ -782,6 +782,7 @@ def test_scc_annotate_routine_seq_pragma(frontend, horizontal, blocking): transformation = SCCAnnotateTransformation(directive='openacc', block_dim=blocking) transformation.transform_subroutine(routine, role='kernel', targets=['some_kernel',]) + # Ensure the routine pragma is in the first pragma in the spec pragmas = FindNodes(Pragma).visit(routine.spec) assert len(pragmas) == 1 assert pragmas[0].keyword == 'acc' From 33514eb886bf063ce116127dd90e7df8ac3d0fe8 Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Mon, 9 Sep 2024 09:45:21 +0000 Subject: [PATCH 11/12] SingleColumn: Fix corner case of empty `!$acc data` clauses Adds a small test and does not print data clauses if no arrays are passed to the routine. 
--- .../transformations/single_column/annotate.py | 7 ++-- .../single_column/tests/test_scc.py | 37 +++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/loki/transformations/single_column/annotate.py b/loki/transformations/single_column/annotate.py index d2d641576..d3283a963 100644 --- a/loki/transformations/single_column/annotate.py +++ b/loki/transformations/single_column/annotate.py @@ -135,9 +135,10 @@ def annotate_kernel_routine(self, routine): args += [a for a in routine.arguments if isinstance(a.type.dtype, DerivedType)] argnames = [str(a.name) for a in args] - routine.body.prepend(ir.Pragma(keyword='acc', content=f'data present({", ".join(argnames)})')) - # Add comment to prevent false-attachment in case it is preceded by an "END DO" statement - routine.body.append((ir.Comment(text=''), ir.Pragma(keyword='acc', content='end data'))) + if argnames: + routine.body.prepend(ir.Pragma(keyword='acc', content=f'data present({", ".join(argnames)})')) + # Add comment to prevent false-attachment in case it is preceded by an "END DO" statement + routine.body.append((ir.Comment(text=''), ir.Pragma(keyword='acc', content='end data'))) def transform_subroutine(self, routine, **kwargs): """ diff --git a/loki/transformations/single_column/tests/test_scc.py b/loki/transformations/single_column/tests/test_scc.py index 474bef9c2..acf15e169 100644 --- a/loki/transformations/single_column/tests/test_scc.py +++ b/loki/transformations/single_column/tests/test_scc.py @@ -789,6 +789,43 @@ def test_scc_annotate_routine_seq_pragma(frontend, horizontal, blocking): assert pragmas[0].content == 'routine seq' +@pytest.mark.parametrize('frontend', available_frontends()) +def test_scc_annotate_empty_data_clause(frontend, horizontal, blocking): + """ + Test that we do not generate empty `!$acc data` clauses. + """ + + fcode = """ + subroutine some_kernel(n) + implicit none + ! 
Scalars should not show up in `!$acc data` clause + integer, intent(inout) :: n +!$loki routine seq + integer :: k + + k = n + do k=1, 3 + n = k + 1. + enddo + end subroutine some_kernel + """ + routine = Subroutine.from_source(fcode, frontend=frontend) + + pragmas = FindNodes(Pragma).visit(routine.ir) + assert len(pragmas) == 1 + assert pragmas[0].keyword == 'loki' + assert pragmas[0].content == 'routine seq' + + transformation = SCCAnnotateTransformation(directive='openacc', block_dim=blocking) + transformation.transform_subroutine(routine, role='kernel', targets=['some_kernel',]) + + # Ensure the routine pragma is in the first pragma in the spec + pragmas = FindNodes(Pragma).visit(routine.ir) + assert len(pragmas) == 1 + assert pragmas[0].keyword == 'acc' + assert pragmas[0].content == 'routine seq' + + @pytest.mark.parametrize('frontend', available_frontends()) def test_scc_vector_reduction(frontend, horizontal, blocking): """ From 208a99b73b97224622d6a536a1fd54f1a31c8075 Mon Sep 17 00:00:00 2001 From: Michael Lange Date: Thu, 19 Sep 2024 09:43:02 +0000 Subject: [PATCH 12/12] SingleColumn: Fix literal in new data clause test --- loki/transformations/single_column/tests/test_scc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loki/transformations/single_column/tests/test_scc.py b/loki/transformations/single_column/tests/test_scc.py index acf15e169..71549ceac 100644 --- a/loki/transformations/single_column/tests/test_scc.py +++ b/loki/transformations/single_column/tests/test_scc.py @@ -805,7 +805,7 @@ def test_scc_annotate_empty_data_clause(frontend, horizontal, blocking): k = n do k=1, 3 - n = k + 1. + n = k + 1 enddo end subroutine some_kernel """