From a70fafb3693db553f1198ea64b3c4dc081a1f593 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 10 Oct 2023 08:07:33 +0000 Subject: [PATCH] compiler: Revamp Parizer scoring function --- devito/passes/iet/langbase.py | 3 -- devito/passes/iet/parpragma.py | 74 ++++++++++++++++++++++------------ tests/test_dle.py | 18 +++++---- 3 files changed, 59 insertions(+), 36 deletions(-) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 2acccba648f..91e68fc02b1 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -214,9 +214,6 @@ def DeviceIteration(self): def Prodder(self): return self.lang.Prodder - def _device_pointers(self, *args, **kwargs): - return {} - class DeviceAwareMixin(object): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index a4513ef31fd..1deb4f3f8bf 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -1,3 +1,5 @@ +from itertools import takewhile + import numpy as np import cgen as c from cached_property import cached_property @@ -254,6 +256,36 @@ def nthreads_nonaffine(self): def threadid(self): return self.sregistry.threadid + def _score_candidate(self, n0, root, collapsable=()): + """ + The score of a collapsable nest depends on the number of fully-parallel + Iterations and their position in the nest (the outer, the better). 
+ """ + nest = [root] + list(collapsable) + n = len(nest) + + # Number of fully-parallel collapsable Iterations + key = lambda i: i.is_ParallelNoAtomic + fpiters = list(takewhile(key, nest)) + nfpiters = len(fpiters) + + # Prioritize the Dimensions that are more likely to define larger + # iteration spaces + fpdims = [i.dim for i in fpiters] + key = lambda d: (not d.is_Derived or + d.is_Custom or # NOTE: might use a refinement + (d.is_Block and d._depth == 1)) + nfpiters_large = len([d for d in fpdims if key(d)]) + + return ( + int(nfpiters == n), # Fully-parallel nest + int(nfpiters == 0 and n), # Fully-atomic nest + nfpiters_large, + -(n0 + 1), # The outer, the better + nfpiters, + n, + ) + def _select_candidates(self, candidates): assert candidates @@ -263,15 +295,18 @@ def _select_candidates(self, candidates): mapper = {} for n0, root in enumerate(candidates): + # Score `root` in isolation + mapper[(root, ())] = self._score_candidate(n0, root) + collapsable = [] for n, i in enumerate(candidates[n0+1:], n0+1): # The Iteration nest [root, ..., i] must be perfect if not IsPerfectIteration(depth=i).visit(root): break - # Loops are collapsable only if none of the iteration variables appear - # in initializer expressions. For example, the following two loops - # cannot be collapsed + # Loops are collapsable only if none of the iteration variables + # appear in initializer expressions. For example, the following + # two loops cannot be collapsed # # for (i = ... ) # for (j = i ...) @@ -281,7 +316,7 @@ def _select_candidates(self, candidates): if any(j.dim in i.symbolic_min.free_symbols for j in candidates[n0:n]): break - # Also, we do not want to collapse SIMD-vectorized Iterations + # Can't collapse SIMD-vectorized Iterations if i.is_Vectorized: break @@ -297,17 +332,9 @@ def _select_candidates(self, candidates): collapsable.append(i) - # Give a score to this candidate, based on the number of fully-parallel - # Iterations and their position (i.e. 
outermost to innermost) in the nest - score = ( - int(root.is_ParallelNoAtomic), - len(self._device_pointers(root)), # Outermost offloadable - int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1), - int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1), - -(n0 + 1) # The outermost, the better - ) - - mapper[(root, tuple(collapsable))] = score + # Score `root + collapsable` + v = tuple(collapsable) + mapper[(root, v)] = self._score_candidate(n0, root, v) # Retrieve the candidates with highest score root, collapsable = max(mapper, key=mapper.get) @@ -318,16 +345,6 @@ def _make_reductions(self, partree): if not any(i.is_ParallelAtomic for i in partree.collapsed): return partree - # We bypass the corner case where a reduction might not be optimal, mainly: - # - Only the most inner loop is atomic - # In which case we can parallelize the perfect nest - # The opposite corner case (most outer loop atomic) - # should be detected before this pass - nc = len(partree.collapsed) - if nc > 1 and all(i.is_ParallelNoAtomic for i in partree.collapsed[:nc-1]): - mapper = {partree.root: partree.root._rebuild(ncollapsed=nc-1)} - return Transformer(mapper).visit(partree) - exprs = [i for i in FindNodes(Expression).visit(partree) if i.is_reduction] reductions = [] @@ -586,6 +603,13 @@ def __init__(self, sregistry, options, platform, compiler): self.par_tile = UnboundTuple(options['par-tile']) self.par_disabled = options['par-disabled'] + def _score_candidate(self, n0, root, collapsable=()): + # `ndptrs`, the number of device pointers, is part of the score too, to + # ensure the outermost loop is offloaded + ndptrs = len(self._device_pointers(root)) + + return (ndptrs,) + super()._score_candidate(n0, root, collapsable) + def _make_threaded_prodders(self, partree): if isinstance(partree.root, self.DeviceIteration): # no-op for now diff --git a/tests/test_dle.py b/tests/test_dle.py index e7935dd45dc..3a94f46a9dd 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ 
-863,8 +863,9 @@ def test_incs_no_atomic(self): op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, 'par-collapse-work': 0})) - assert 'collapse(3)' in str(op0) - assert 'atomic' in str(op0) + assert 'omp for schedule' in str(op0) + assert 'collapse' not in str(op0) + assert 'atomic' not in str(op0) # Now only `x` is parallelized op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)], @@ -877,22 +878,23 @@ def test_incs_no_atomic(self): def test_incr_perfect_outer(self): grid = Grid((5, 5)) d = Dimension(name="d") + u = Function(name="u", dimensions=(*grid.dimensions, d), grid=grid, shape=(*grid.shape, 5), ) v = Function(name="v", dimensions=(*grid.dimensions, d), grid=grid, shape=(*grid.shape, 5)) + w = Function(name="w", grid=grid) + u.data.fill(1) v.data.fill(2) - w = Function(name="w", grid=grid) - summation = Inc(w, u*v) - op0 = Operator([summation], opt=('advanced', {'openmp': True})) - assert 'reduction' not in str(op0) - assert 'omp for' in str(op0) + op = Operator([summation], opt=('advanced', {'openmp': True})) + assert 'reduction' not in str(op) + assert 'omp for' in str(op) - op0() + op() assert np.all(w.data == 10) @pytest.mark.parametrize('exprs,simd_level,expected', [