diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 2acccba648..91e68fc02b 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -214,9 +214,6 @@ def DeviceIteration(self): def Prodder(self): return self.lang.Prodder - def _device_pointers(self, *args, **kwargs): - return {} - class DeviceAwareMixin(object): diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index b6476192b2..29ba9a986b 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -1,3 +1,5 @@ +from itertools import takewhile + import numpy as np import cgen as c from cached_property import cached_property @@ -254,6 +256,46 @@ def nthreads_nonaffine(self): def threadid(self): return self.sregistry.threadid + def _score_candidate(self, n0, root, collapsable=()): + """ + The score of a collapsable nest depends on the number of fully-parallel + Iterations and their position in the nest (the outer, the better). + """ + nest = [root] + list(collapsable) + n = len(nest) + + # Number of fully-parallel collapsable Iterations + key = lambda i: i.is_ParallelNoAtomic + fp_iters = list(takewhile(key, nest)) + n_fp_iters = len(fp_iters) + + # Number of parallel-if-atomic collapsable Iterations + key = lambda i: i.is_ParallelAtomic + pia_iters = list(takewhile(key, nest)) + n_pia_iters = len(pia_iters) + + # Prioritize the Dimensions that are more likely to define larger + # iteration spaces + key = lambda d: (not d.is_Derived or + (d.is_Custom and not is_integer(d.symbolic_size)) or + (d.is_Block and d._depth == 1)) + + fpdims = [i.dim for i in fp_iters] + n_fp_iters_large = len([d for d in fpdims if key(d)]) + + piadims = [i.dim for i in pia_iters] + n_pia_iters_large = len([d for d in piadims if key(d)]) + + return ( + int(n_fp_iters == n), # Fully-parallel nest + n_fp_iters_large, + n_fp_iters, + n_pia_iters_large, + n_pia_iters, + -(n0 + 1), # The outer, the better + n, + ) + def _select_candidates(self, candidates): assert candidates @@ -263,15 +305,18 @@ def _select_candidates(self, candidates): mapper = {} for n0, root in enumerate(candidates): + # Score `root` in isolation + mapper[(root, ())] = self._score_candidate(n0, root) + collapsable = [] for n, i in enumerate(candidates[n0+1:], n0+1): # The Iteration nest [root, ..., i] must be perfect if not IsPerfectIteration(depth=i).visit(root): break - # Loops are collapsable only if none of the iteration variables appear - # in initializer expressions. For example, the following two loops - # cannot be collapsed + # Loops are collapsable only if none of the iteration variables + # appear in initializer expressions. For example, the following + # two loops cannot be collapsed # # for (i = ... ) # for (j = i ...) @@ -281,7 +326,7 @@ def _select_candidates(self, candidates): if any(j.dim in i.symbolic_min.free_symbols for j in candidates[n0:n]): break - # Also, we do not want to collapse SIMD-vectorized Iterations + # Can't collapse SIMD-vectorized Iterations if i.is_Vectorized: break @@ -297,17 +342,9 @@ def _select_candidates(self, candidates): collapsable.append(i) - # Give a score to this candidate, based on the number of fully-parallel - # Iterations and their position (i.e. outermost to innermost) in the nest - score = ( - int(root.is_ParallelNoAtomic), - len(self._device_pointers(root)), # Outermost offloadable - int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1), - int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1), - -(n0 + 1) # The outermost, the better - ) - - mapper[(root, tuple(collapsable))] = score + # Score `root + collapsable` + v = tuple(collapsable) + mapper[(root, v)] = self._score_candidate(n0, root, v) # Retrieve the candidates with highest score root, collapsable = max(mapper, key=mapper.get) @@ -576,6 +613,13 @@ def __init__(self, sregistry, options, platform, compiler): self.par_tile = UnboundTuple(options['par-tile']) self.par_disabled = options['par-disabled'] + def _score_candidate(self, n0, root, collapsable=()): + # `ndptrs`, the number of device pointers, part of the score too to + # ensure the outermost loop is offloaded + ndptrs = len(self._device_pointers(root)) + + return (ndptrs,) + super()._score_candidate(n0, root, collapsable) + def _make_threaded_prodders(self, partree): if isinstance(partree.root, self.DeviceIteration): # no-op for now diff --git a/tests/test_adjoint.py b/tests/test_adjoint.py index 473e484c0e..759b86a3ae 100644 --- a/tests/test_adjoint.py +++ b/tests/test_adjoint.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from devito import Operator, norm, Function, Grid, SparseFunction +from devito import Operator, norm, Function, Grid, SparseFunction, inner from devito.logger import info from examples.seismic import demo_model, Receiver from examples.seismic.acoustic import acoustic_setup @@ -114,7 +114,7 @@ def test_adjoint_F(self, mkey, shape, kernel, space_order, time_order, setup_fun solver.adjoint(rec=rec, srca=srca) # Adjoint test: Verify matches closely - term1 = np.dot(srca.data.reshape(-1), solver.geometry.src.data) + term1 = inner(srca, solver.geometry.src) term2 = norm(rec) ** 2 info(': %f, : %f, difference: %4.4e, ratio: %f' % (term1, term2, (term1 - term2)/term1, term1 / term2)) @@ -231,6 +231,6 @@ def test_adjoint_inject_interpolate(self, shape, coords, npoints=19): # y => p # x => c # P^T y => a - term1 = np.dot(p2.data.reshape(-1), p.data.reshape(-1)) - term2 = np.dot(c.data.reshape(-1), a.data.reshape(-1)) + term1 = inner(p2, p) + term2 = inner(c, a) assert np.isclose((term1-term2) / term1, 0., atol=1.e-6) diff --git a/tests/test_dle.py b/tests/test_dle.py index ec24128983..03c0b533c9 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -284,7 +284,7 @@ def test_cache_blocking_structure_optrelax_prec_inject(): 'openmp': True, 'par-collapse-ncores': 1})) - assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], + assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'], 't,p_s0_blk0,p_s,rsx,rsy') @@ -863,9 +863,9 @@ def test_incs_no_atomic(self): op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, 'par-collapse-work': 0})) - - assert 'collapse(3)' in str(op0) - assert 'atomic' in str(op0) + assert 'omp for schedule' in str(op0) + assert 'collapse' not in str(op0) + assert 'atomic' not in str(op0) # Now only `x` is parallelized op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)], @@ -875,6 +875,46 @@ def test_incs_no_atomic(self): assert 'collapse' not in str(op1) assert 'atomic' not in str(op1) + def test_incr_perfect_outer(self): + grid = Grid((5, 5)) + d = Dimension(name="d") + + u = Function(name="u", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5), ) + v = Function(name="v", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5)) + w = Function(name="w", grid=grid) + + u.data.fill(1) + v.data.fill(2) + + summation = Inc(w, u*v) + + op = Operator([summation], opt=('advanced', {'openmp': True})) + assert 'reduction' not in str(op) + assert 'omp for' in str(op) + + op() + assert np.all(w.data == 10) + + def test_incr_perfect_sparse_outer(self): + grid = Grid(shape=(3, 3, 3)) + + u = TimeFunction(name='u', grid=grid) + s = SparseTimeFunction(name='u', grid=grid, npoint=1, nt=11) + + eqns = s.inject(u, expr=s) + + op = Operator(eqns, opt=('advanced', {'par-collapse-ncores': 0, + 'openmp': True})) + + iters = FindNodes(Iteration).visit(op) + assert len(iters) == 5 + assert iters[0].is_Sequential + assert all(i.is_ParallelAtomic for i in iters[1:]) + assert iters[1].pragmas[0].value == 'omp for schedule(dynamic,chunk_size)' + assert all(not i.pragmas for i in iters[2:]) + @pytest.mark.parametrize('exprs,simd_level,expected', [ (['Eq(y.symbolic_max, g[0, x], implicit_dims=(t, x))', 'Inc(h1[0, 0], 1, implicit_dims=(t, x, y))'], diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 031bd9181b..071005464d 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -6,9 +6,9 @@ from conftest import assert_structure from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension, - MatrixSparseTimeFunction, SparseTimeFunction, SubDimension, - SubDomain, SubDomainSet, TimeFunction, Operator, configuration, - switchconfig) + Dimension, MatrixSparseTimeFunction, SparseTimeFunction, + SubDimension, SubDomain, SubDomainSet, TimeFunction, + Operator, configuration, switchconfig) from devito.arch import get_gpu_info from devito.exceptions import InvalidArgument from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols, @@ -110,6 +110,30 @@ def test_fission(self): assert np.all(usave.data[5:] == expected[5:]) assert np.all(vsave.data[:5] == expected[:5]) + def test_incr_perfect_outer(self): + grid = Grid((5, 5)) + d = Dimension(name="d") + + u = Function(name="u", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5), ) + v = Function(name="v", dimensions=(*grid.dimensions, d), + grid=grid, shape=(*grid.shape, 5)) + w = Function(name="w", grid=grid) + + u.data.fill(1) + v.data.fill(2) + + summation = Inc(w, u*v) + + op = Operator([summation]) + + assert 'reduction' not in str(op) + assert 'collapse(2)' in str(op) + assert 'parallel' in str(op) + + op() + assert np.all(w.data == 10) + class Bundle(SubDomain): """