From 25a5e6ddea4267f5deb14ee9d1dc720ec64b7207 Mon Sep 17 00:00:00 2001 From: Mathias Louboutin Date: Mon, 18 Sep 2023 11:56:45 -0400 Subject: [PATCH] compiler: prevent halos from being moved outside their iteration space --- devito/ir/stree/algorithms.py | 5 +++++ devito/mpi/halo_scheme.py | 4 ++++ devito/passes/iet/parpragma.py | 8 +++++++- tests/test_dle.py | 10 ++++++---- tests/test_gpu_common.py | 6 +++--- tests/test_gpu_openacc.py | 16 ++++++++-------- tests/test_gpu_openmp.py | 2 +- tests/test_mpi.py | 4 +++- 8 files changed, 37 insertions(+), 18 deletions(-) diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py index 58e8e844e69..422f62030ac 100644 --- a/devito/ir/stree/algorithms.py +++ b/devito/ir/stree/algorithms.py @@ -147,6 +147,11 @@ def preprocess(clusters, options=None, **kwargs): found = [] for c1 in list(queue): distributed_aindices = c1.halo_scheme.distributed_aindices + h_indices = c1.halo_scheme.loc_indices + + # Skip if the Halo exchange would fall outside its needed iteration space + if not all(getattr(d, 'd', d) in c.ispace.dimensions for d in h_indices): + continue diff = dims - distributed_aindices intersection = dims & distributed_aindices diff --git a/devito/mpi/halo_scheme.py b/devito/mpi/halo_scheme.py index 0204c171e67..e63bbf5ac58 100644 --- a/devito/mpi/halo_scheme.py +++ b/devito/mpi/halo_scheme.py @@ -361,6 +361,10 @@ def distributed(self): def distributed_aindices(self): return set().union(*[i.dims for i in self.fmapper.values()]) + @cached_property + def loc_indices(self): + return set().union(*[i.loc_indices.values() for i in self.fmapper.values()]) + @cached_property def arguments(self): return self.dimensions | set(flatten(self.honored.values())) diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index 9d69e12df7f..9dd738b55f4 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -295,7 +295,7 @@ def _select_candidates(self, candidates): except TypeError: 
pass - collapsable.append(i) + collapsable.append(i) # Give a score to this candidate, based on the number of fully-parallel # Iterations and their position (i.e. outermost to innermost) in the nest @@ -375,6 +375,12 @@ def _make_partree(self, candidates, nthreads=None): ncollapsed=ncollapsed, nthreads=nthreads, **root.args) prefix = [] + elif all(i.is_ParallelRelaxed for i in candidates) and nthreads is not None: + body = self.HostIteration(schedule='static', + parallel=nthreads is not self.nthreads_nested, + ncollapsed=ncollapsed, nthreads=nthreads, + **root.args) + prefix = [] else: # pragma ... for ... schedule(..., expr) assert nthreads is None diff --git a/tests/test_dle.py b/tests/test_dle.py index 3b9883e6652..df3c4adfa58 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -291,7 +291,7 @@ def test_cache_blocking_structure_optrelax_prec_inject(): 'openmp': True, 'par-collapse-ncores': 1})) - assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], + assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'], 't,p_s0_blk0,p_s,rsx,rsy') @@ -821,12 +821,13 @@ def test_incs_no_atomic(self): 'par-collapse-ncores': 1, 'par-collapse-work': 0})) - assert 'collapse(2)' in str(op0) + assert 'collapse(3)' in str(op0) assert 'atomic' in str(op0) # Now only `x` is parallelized op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)], opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1})) + assert 'omp for' in str(op1) assert 'collapse' not in str(op1) assert 'atomic' not in str(op1) @@ -951,11 +952,12 @@ def test_parallel_prec_inject(self): eqns = sf.inject(field=u.forward, expr=sf * dt**2) op0 = Operator(eqns, opt=('advanced', {'openmp': True, - 'par-collapse-ncores': 1})) + 'par-collapse-ncores': 20})) iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas assert 'omp for' in iterations[1].pragmas[0].value + assert 'collapse' not in iterations[1].pragmas[0].value op0 = Operator(eqns, 
opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1, @@ -963,7 +965,7 @@ def test_parallel_prec_inject(self): iterations = FindNodes(Iteration).visit(op0) assert not iterations[0].pragmas - assert 'omp for' in iterations[1].pragmas[0].value + assert 'omp for collapse' in iterations[2].pragmas[0].value class TestNestedParallelism(object): diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 031bd9181ba..628c04f099a 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -97,9 +97,9 @@ def test_fission(self): assert trees[0].root is trees[1].root assert trees[0][1] is not trees[1][1] assert trees[0].root.dim is time - assert not trees[0].root.pragmas - assert trees[0][1].pragmas - assert trees[1][1].pragmas + assert trees[0].root.pragmas + assert not trees[0][1].pragmas + assert not trees[0][2].pragmas op.apply() diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index 823d11854de..db92db3c83f 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - assert len(trees) == 4 + assert len(trees) == 6 - assert trees[0][1].pragmas[0].value ==\ - 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].value ==\ + 'acc parallel loop tile(32,4,4) present(u)' + assert trees[2][1].pragmas[0].value ==\ 'acc parallel loop tile(32,4) present(u)' # Only the AFFINE Iterations are tiled - assert trees[3][1].pragmas[0].value ==\ - 'acc parallel loop collapse(3) present(src,src_coords,u)' + assert trees[4][1].pragmas[0].value ==\ + 'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)' @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)), ((32, 4, 4), (8, 8, 8))]) @@ -130,11 +130,11 @@ def test_multiple_tile_sizes(self, par_tile): opt=('advanced', {'par-tile': par_tile})) trees = retrieve_iteration_tree(op) - 
assert len(trees) == 4 + assert len(trees) == 6 - assert trees[0][1].pragmas[0].value ==\ - 'acc parallel loop tile(32,4,4) present(u)' assert trees[1][1].pragmas[0].value ==\ + 'acc parallel loop tile(32,4,4) present(u)' + assert trees[2][1].pragmas[0].value ==\ 'acc parallel loop tile(8,8) present(u)' def test_multi_tile_blocking_structure(self): diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index bc2de717082..29866508d85 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -265,7 +265,7 @@ def test_timeparallel_reduction(self): assert not tree.root.pragmas assert len(tree[1].pragmas) == 1 assert tree[1].pragmas[0].value ==\ - ('omp target teams distribute parallel for collapse(2)' + ('omp target teams distribute parallel for collapse(3)' ' reduction(+:f[0])') diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 2860fc726e4..d9d1127434e 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -1546,6 +1546,7 @@ def test_injection_wodup_wtime(self): sf.data[2, :] = 12. op = Operator(sf.inject(field=f, expr=sf + 1)) + print(op) op.apply() assert np.all(f.data[0] == 1.25) @@ -2558,7 +2559,8 @@ def test_adjoint_F_no_omp(self): # TestDecomposition().test_reshape_left_right() # TestOperatorSimple().test_trivial_eq_2d() # TestFunction().test_halo_exchange_bilateral() - TestSparseFunction().test_sparse_coords() + # TestSparseFunction().test_sparse_coords() # TestSparseFunction().test_precomputed_sparse(2) # TestOperatorAdvanced().test_fission_due_to_antidep() + TestOperatorAdvanced().test_injection_wodup_wtime() # TestIsotropicAcoustic().test_adjoint_F(1)