From f7ab007e0a735952ca9d40509fe7ac9ec222f55c Mon Sep 17 00:00:00 2001
From: Mathias Louboutin <mathias.louboutin@gmail.com>
Date: Mon, 18 Sep 2023 11:56:45 -0400
Subject: [PATCH] compiler: prevent halo exchanges from being moved outside
 their iteration space

---
 devito/ir/stree/algorithms.py  |  6 ++++++
 devito/mpi/halo_scheme.py      |  4 ++++
 devito/passes/iet/langbase.py  | 13 ++++++++++---
 devito/passes/iet/parpragma.py | 10 ++++++++--
 tests/test_dle.py              | 10 ++++++----
 tests/test_gpu_openacc.py      | 16 ++++++++--------
 tests/test_gpu_openmp.py       |  2 +-
 tests/test_mpi.py              |  3 ++-
 8 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/devito/ir/stree/algorithms.py b/devito/ir/stree/algorithms.py
index 58e8e844e6..d8bbb4958a 100644
--- a/devito/ir/stree/algorithms.py
+++ b/devito/ir/stree/algorithms.py
@@ -147,6 +147,12 @@ def preprocess(clusters, options=None, **kwargs):
             found = []
             for c1 in list(queue):
                 distributed_aindices = c1.halo_scheme.distributed_aindices
+                h_indices = set().union(*[(d, d.root)
+                                          for d in c1.halo_scheme.loc_indices])
+
+                # Skip if the halo exchange would fall outside its needed iteration space
+                if h_indices and not h_indices & dims:
+                    continue
 
                 diff = dims - distributed_aindices
                 intersection = dims & distributed_aindices
diff --git a/devito/mpi/halo_scheme.py b/devito/mpi/halo_scheme.py
index 0204c171e6..970e84633d 100644
--- a/devito/mpi/halo_scheme.py
+++ b/devito/mpi/halo_scheme.py
@@ -361,6 +361,10 @@ def distributed(self):
     def distributed_aindices(self):
         return set().union(*[i.dims for i in self.fmapper.values()])
 
+    @cached_property
+    def loc_indices(self):
+        return set().union(*[i.loc_indices.keys() for i in self.fmapper.values()])
+
     @cached_property
     def arguments(self):
         return self.dimensions | set(flatten(self.honored.values()))
diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py
index 4a4f6ac465..457d8476c3 100644
--- a/devito/passes/iet/langbase.py
+++ b/devito/passes/iet/langbase.py
@@ -214,8 +214,8 @@ def DeviceIteration(self):
     def Prodder(self):
         return self.lang.Prodder
 
-    def _is_offloadable(self, *args, **kwargs):
-        return False
+    def _n_device_pointers(self, *args, **kwargs):
+        return 0
 
 
 class DeviceAwareMixin(object):
@@ -328,6 +328,12 @@ def _(iet):
 
         return _initialize(iet)
 
+    def _n_device_pointers(self, iet):
+        functions = FindSymbols().visit(iet)
+        devfuncs = [f for f in functions if f.is_Array and f._mem_local]
+
+        return len(devfuncs)
+
     def _is_offloadable(self, iet):
         """
         True if the IET computation is offloadable to device, False otherwise.
@@ -339,7 +345,8 @@ def _is_offloadable(self, iet):
         functions = FindSymbols().visit(iet)
         buffers = [f for f in functions if f.is_Array and f._mem_mapped]
         hostfuncs = [f for f in functions if not is_on_device(f, self.gpu_fit)]
-        return not (buffers and hostfuncs)
+
+        return not (hostfuncs and buffers)
 
 
 class Sections(tuple):
diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py
index 9d69e12df7..34ca370a60 100644
--- a/devito/passes/iet/parpragma.py
+++ b/devito/passes/iet/parpragma.py
@@ -295,13 +295,13 @@ def _select_candidates(self, candidates):
                     except TypeError:
                         pass
 
-                    collapsable.append(i)
+                collapsable.append(i)
 
             # Give a score to this candidate, based on the number of fully-parallel
             # Iterations and their position (i.e. outermost to innermost) in the nest
             score = (
                 int(root.is_ParallelNoAtomic),
-                -int(self._is_offloadable(root))*(n0 + 1),  # Outermost offloadable
+                self._n_device_pointers(root),  # Number of device pointers
                 int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1),
                 int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1),
                 -(n0 + 1)  # The outermost, the better
@@ -375,6 +375,12 @@ def _make_partree(self, candidates, nthreads=None):
                                           ncollapsed=ncollapsed, nthreads=nthreads,
                                           **root.args)
             prefix = []
+        elif all(i.is_ParallelRelaxed for i in candidates) and nthreads is not None:
+            body = self.HostIteration(schedule='static',
+                                      parallel=nthreads is not self.nthreads_nested,
+                                      ncollapsed=ncollapsed, nthreads=nthreads,
+                                      **root.args)
+            prefix = []
         else:
             # pragma ... for ... schedule(..., expr)
             assert nthreads is None
diff --git a/tests/test_dle.py b/tests/test_dle.py
index 3b9883e665..df3c4adfa5 100644
--- a/tests/test_dle.py
+++ b/tests/test_dle.py
@@ -291,7 +291,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
                                           'openmp': True,
                                           'par-collapse-ncores': 1}))
 
-    assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
+    assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
                      't,p_s0_blk0,p_s,rsx,rsy')
 
 
@@ -821,12 +821,13 @@ def test_incs_no_atomic(self):
                                                      'par-collapse-ncores': 1,
                                                      'par-collapse-work': 0}))
 
-        assert 'collapse(2)' in str(op0)
+        assert 'collapse(3)' in str(op0)
         assert 'atomic' in str(op0)
 
         # Now only `x` is parallelized
         op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)],
                        opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1}))
+
         assert 'omp for' in str(op1)
         assert 'collapse' not in str(op1)
         assert 'atomic' not in str(op1)
@@ -951,11 +952,12 @@ def test_parallel_prec_inject(self):
         eqns = sf.inject(field=u.forward, expr=sf * dt**2)
 
         op0 = Operator(eqns, opt=('advanced', {'openmp': True,
-                                               'par-collapse-ncores': 1}))
+                                               'par-collapse-ncores': 20}))
         iterations = FindNodes(Iteration).visit(op0)
 
         assert not iterations[0].pragmas
         assert 'omp for' in iterations[1].pragmas[0].value
+        assert 'collapse' not in iterations[1].pragmas[0].value
 
         op0 = Operator(eqns, opt=('advanced', {'openmp': True,
                                                'par-collapse-ncores': 1,
@@ -963,7 +965,7 @@ def test_parallel_prec_inject(self):
         iterations = FindNodes(Iteration).visit(op0)
 
         assert not iterations[0].pragmas
-        assert 'omp for' in iterations[1].pragmas[0].value
+        assert 'omp for collapse' in iterations[2].pragmas[0].value
 
 
 class TestNestedParallelism(object):
diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py
index 823d11854d..db92db3c83 100644
--- a/tests/test_gpu_openacc.py
+++ b/tests/test_gpu_openacc.py
@@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile):
                       opt=('advanced', {'par-tile': par_tile}))
 
         trees = retrieve_iteration_tree(op)
-        assert len(trees) == 4
+        assert len(trees) == 6
 
-        assert trees[0][1].pragmas[0].value ==\
-            'acc parallel loop tile(32,4,4) present(u)'
         assert trees[1][1].pragmas[0].value ==\
+            'acc parallel loop tile(32,4,4) present(u)'
+        assert trees[2][1].pragmas[0].value ==\
             'acc parallel loop tile(32,4) present(u)'
         # Only the AFFINE Iterations are tiled
-        assert trees[3][1].pragmas[0].value ==\
-            'acc parallel loop collapse(3) present(src,src_coords,u)'
+        assert trees[4][1].pragmas[0].value ==\
+            'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)'
 
     @pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)),
                                           ((32, 4, 4), (8, 8, 8))])
@@ -130,11 +130,11 @@ def test_multiple_tile_sizes(self, par_tile):
                       opt=('advanced', {'par-tile': par_tile}))
 
         trees = retrieve_iteration_tree(op)
-        assert len(trees) == 4
+        assert len(trees) == 6
 
-        assert trees[0][1].pragmas[0].value ==\
-            'acc parallel loop tile(32,4,4) present(u)'
         assert trees[1][1].pragmas[0].value ==\
+            'acc parallel loop tile(32,4,4) present(u)'
+        assert trees[2][1].pragmas[0].value ==\
             'acc parallel loop tile(8,8) present(u)'
 
     def test_multi_tile_blocking_structure(self):
diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py
index bc2de71708..29866508d8 100644
--- a/tests/test_gpu_openmp.py
+++ b/tests/test_gpu_openmp.py
@@ -265,7 +265,7 @@ def test_timeparallel_reduction(self):
         assert not tree.root.pragmas
         assert len(tree[1].pragmas) == 1
         assert tree[1].pragmas[0].value ==\
-            ('omp target teams distribute parallel for collapse(2)'
+            ('omp target teams distribute parallel for collapse(3)'
              ' reduction(+:f[0])')
 
 
diff --git a/tests/test_mpi.py b/tests/test_mpi.py
index 2860fc726e..51facd7a7c 100644
--- a/tests/test_mpi.py
+++ b/tests/test_mpi.py
@@ -2558,7 +2558,8 @@ def test_adjoint_F_no_omp(self):
     # TestDecomposition().test_reshape_left_right()
     # TestOperatorSimple().test_trivial_eq_2d()
     # TestFunction().test_halo_exchange_bilateral()
-    TestSparseFunction().test_sparse_coords()
+    # TestSparseFunction().test_sparse_coords()
     # TestSparseFunction().test_precomputed_sparse(2)
     # TestOperatorAdvanced().test_fission_due_to_antidep()
+    TestOperatorAdvanced().test_injection_wodup_wtime()
     # TestIsotropicAcoustic().test_adjoint_F(1)