From 25a5e6ddea4267f5deb14ee9d1dc720ec64b7207 Mon Sep 17 00:00:00 2001
From: Mathias Louboutin <mathias.louboutin@gmail.com>
Date: Mon, 18 Sep 2023 11:56:45 -0400
Subject: [PATCH] compiler: prevent halos from being moved outside their
 iteration space

---
 devito/ir/stree/algorithms.py  |  5 +++++
 devito/mpi/halo_scheme.py      |  4 ++++
 devito/passes/iet/parpragma.py |  8 +++++++-
 tests/test_dle.py              | 10 ++++++----
 tests/test_gpu_common.py       |  6 +++---
 tests/test_gpu_openacc.py      | 16 ++++++++--------
 tests/test_gpu_openmp.py       |  2 +-
 tests/test_mpi.py              |  4 +++-
 8 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py
index 58e8e844e69..422f62030ac 100644
--- a/devito/ir/stree/algorithms.py
+++ b/devito/ir/stree/algorithms.py
@@ -147,6 +147,11 @@ def preprocess(clusters, options=None, **kwargs):
             found = []
             for c1 in list(queue):
                 distributed_aindices = c1.halo_scheme.distributed_aindices
+                h_indices = c1.halo_scheme.loc_indices
+
+                # Skip if the halo exchange would fall outside its needed iteration space
+                if not all(getattr(d, 'd', d) in c.ispace.dimensions for d in h_indices):
+                    continue
 
                 diff = dims - distributed_aindices
                 intersection = dims & distributed_aindices
diff --git a/devito/mpi/halo_scheme.py b/devito/mpi/halo_scheme.py
index 0204c171e67..e63bbf5ac58 100644
--- a/devito/mpi/halo_scheme.py
+++ b/devito/mpi/halo_scheme.py
@@ -361,6 +361,10 @@ def distributed(self):
     def distributed_aindices(self):
         return set().union(*[i.dims for i in self.fmapper.values()])
 
+    @cached_property
+    def loc_indices(self):
+        return set().union(*[i.loc_indices.values() for i in self.fmapper.values()])
+
     @cached_property
     def arguments(self):
         return self.dimensions | set(flatten(self.honored.values()))
diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py
index 9d69e12df7f..9dd738b55f4 100644
--- a/devito/passes/iet/parpragma.py
+++ b/devito/passes/iet/parpragma.py
@@ -295,7 +295,7 @@ def _select_candidates(self, candidates):
                     except TypeError:
                         pass
 
-                    collapsable.append(i)
+                collapsable.append(i)
 
             # Give a score to this candidate, based on the number of fully-parallel
             # Iterations and their position (i.e. outermost to innermost) in the nest
@@ -375,6 +375,12 @@ def _make_partree(self, candidates, nthreads=None):
                                           ncollapsed=ncollapsed, nthreads=nthreads,
                                           **root.args)
             prefix = []
+        elif all(i.is_ParallelRelaxed for i in candidates) and nthreads is not None:
+            body = self.HostIteration(schedule='static',
+                                      parallel=nthreads is not self.nthreads_nested,
+                                      ncollapsed=ncollapsed, nthreads=nthreads,
+                                      **root.args)
+            prefix = []
         else:
             # pragma ... for ... schedule(..., expr)
             assert nthreads is None
diff --git a/tests/test_dle.py b/tests/test_dle.py
index 3b9883e6652..df3c4adfa58 100644
--- a/tests/test_dle.py
+++ b/tests/test_dle.py
@@ -291,7 +291,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
                                           'openmp': True,
                                           'par-collapse-ncores': 1}))
 
-    assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
+    assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
                      't,p_s0_blk0,p_s,rsx,rsy')
 
 
@@ -821,12 +821,13 @@ def test_incs_no_atomic(self):
                                                      'par-collapse-ncores': 1,
                                                      'par-collapse-work': 0}))
 
-        assert 'collapse(2)' in str(op0)
+        assert 'collapse(3)' in str(op0)
         assert 'atomic' in str(op0)
 
         # Now only `x` is parallelized
         op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)],
                        opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1}))
+
         assert 'omp for' in str(op1)
         assert 'collapse' not in str(op1)
         assert 'atomic' not in str(op1)
@@ -951,11 +952,12 @@ def test_parallel_prec_inject(self):
         eqns = sf.inject(field=u.forward, expr=sf * dt**2)
 
         op0 = Operator(eqns, opt=('advanced', {'openmp': True,
-                                               'par-collapse-ncores': 1}))
+                                               'par-collapse-ncores': 20}))
         iterations = FindNodes(Iteration).visit(op0)
 
         assert not iterations[0].pragmas
         assert 'omp for' in iterations[1].pragmas[0].value
+        assert 'collapse' not in iterations[1].pragmas[0].value
 
         op0 = Operator(eqns, opt=('advanced', {'openmp': True,
                                                'par-collapse-ncores': 1,
@@ -963,7 +965,7 @@ def test_parallel_prec_inject(self):
         iterations = FindNodes(Iteration).visit(op0)
 
         assert not iterations[0].pragmas
-        assert 'omp for' in iterations[1].pragmas[0].value
+        assert 'omp for collapse' in iterations[2].pragmas[0].value
 
 
 class TestNestedParallelism(object):
diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py
index 031bd9181ba..628c04f099a 100644
--- a/tests/test_gpu_common.py
+++ b/tests/test_gpu_common.py
@@ -97,9 +97,9 @@ def test_fission(self):
         assert trees[0].root is trees[1].root
         assert trees[0][1] is not trees[1][1]
         assert trees[0].root.dim is time
-        assert not trees[0].root.pragmas
-        assert trees[0][1].pragmas
-        assert trees[1][1].pragmas
+        assert trees[0].root.pragmas
+        assert not trees[0][1].pragmas
+        assert not trees[0][2].pragmas
 
         op.apply()
 
diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py
index 823d11854de..db92db3c83f 100644
--- a/tests/test_gpu_openacc.py
+++ b/tests/test_gpu_openacc.py
@@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile):
                       opt=('advanced', {'par-tile': par_tile}))
 
         trees = retrieve_iteration_tree(op)
-        assert len(trees) == 4
+        assert len(trees) == 6
 
-        assert trees[0][1].pragmas[0].value ==\
-            'acc parallel loop tile(32,4,4) present(u)'
         assert trees[1][1].pragmas[0].value ==\
+            'acc parallel loop tile(32,4,4) present(u)'
+        assert trees[2][1].pragmas[0].value ==\
             'acc parallel loop tile(32,4) present(u)'
         # Only the AFFINE Iterations are tiled
-        assert trees[3][1].pragmas[0].value ==\
-            'acc parallel loop collapse(3) present(src,src_coords,u)'
+        assert trees[4][1].pragmas[0].value ==\
+            'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)'
 
     @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)),
                                           ((32, 4, 4), (8, 8, 8))])
@@ -130,11 +130,11 @@ def test_multiple_tile_sizes(self, par_tile):
                       opt=('advanced', {'par-tile': par_tile}))
 
         trees = retrieve_iteration_tree(op)
-        assert len(trees) == 4
+        assert len(trees) == 6
 
-        assert trees[0][1].pragmas[0].value ==\
-            'acc parallel loop tile(32,4,4) present(u)'
         assert trees[1][1].pragmas[0].value ==\
+            'acc parallel loop tile(32,4,4) present(u)'
+        assert trees[2][1].pragmas[0].value ==\
             'acc parallel loop tile(8,8) present(u)'
 
     def test_multi_tile_blocking_structure(self):
diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py
index bc2de717082..29866508d85 100644
--- a/tests/test_gpu_openmp.py
+++ b/tests/test_gpu_openmp.py
@@ -265,7 +265,7 @@ def test_timeparallel_reduction(self):
         assert not tree.root.pragmas
         assert len(tree[1].pragmas) == 1
         assert tree[1].pragmas[0].value ==\
-            ('omp target teams distribute parallel for collapse(2)'
+            ('omp target teams distribute parallel for collapse(3)'
              ' reduction(+:f[0])')
 
 
diff --git a/tests/test_mpi.py b/tests/test_mpi.py
index 2860fc726e4..d9d1127434e 100644
--- a/tests/test_mpi.py
+++ b/tests/test_mpi.py
@@ -1546,6 +1546,7 @@ def test_injection_wodup_wtime(self):
         sf.data[2, :] = 12.
 
         op = Operator(sf.inject(field=f, expr=sf + 1))
+        print(op)
         op.apply()
 
         assert np.all(f.data[0] == 1.25)
@@ -2558,7 +2559,8 @@ def test_adjoint_F_no_omp(self):
     # TestDecomposition().test_reshape_left_right()
     # TestOperatorSimple().test_trivial_eq_2d()
     # TestFunction().test_halo_exchange_bilateral()
-    TestSparseFunction().test_sparse_coords()
+    # TestSparseFunction().test_sparse_coords()
     # TestSparseFunction().test_precomputed_sparse(2)
     # TestOperatorAdvanced().test_fission_due_to_antidep()
+    TestOperatorAdvanced().test_injection_wodup_wtime()
     # TestIsotropicAcoustic().test_adjoint_F(1)