Skip to content

Commit

Permalink
compiler: prevent halo exchanges from being moved outside their iteration space
Browse files Browse the repository at this point in the history
  • Loading branch information
mloubout committed Sep 19, 2023
1 parent 87d8d0e commit f7ab007
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 19 deletions.
6 changes: 6 additions & 0 deletions devito/ir/stree/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@ def preprocess(clusters, options=None, **kwargs):
found = []
for c1 in list(queue):
distributed_aindices = c1.halo_scheme.distributed_aindices
h_indices = set().union(*[(d, d.root)
for d in c1.halo_scheme.loc_indices])

# Skip if the halo exchange would end up outside the iteration space that needs it
if h_indices and not h_indices & dims:
continue

diff = dims - distributed_aindices
intersection = dims & distributed_aindices
Expand Down
4 changes: 4 additions & 0 deletions devito/mpi/halo_scheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,10 @@ def distributed(self):
def distributed_aindices(self):
return set().union(*[i.dims for i in self.fmapper.values()])

@cached_property
def loc_indices(self):
return set().union(*[i.loc_indices.keys() for i in self.fmapper.values()])

@cached_property
def arguments(self):
return self.dimensions | set(flatten(self.honored.values()))
Expand Down
13 changes: 10 additions & 3 deletions devito/passes/iet/langbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,8 @@ def DeviceIteration(self):
def Prodder(self):
return self.lang.Prodder

def _is_offloadable(self, *args, **kwargs):
return False
def _n_device_pointers(self, *args, **kwargs):
return 0


class DeviceAwareMixin(object):
Expand Down Expand Up @@ -328,6 +328,12 @@ def _(iet):

return _initialize(iet)

def _n_device_pointers(self, iet):
functions = FindSymbols().visit(iet)
devfuncs = [f for f in functions if f.is_Array and f._mem_local]

return len(devfuncs)

def _is_offloadable(self, iet):
"""
True if the IET computation is offloadable to device, False otherwise.
Expand All @@ -339,7 +345,8 @@ def _is_offloadable(self, iet):
functions = FindSymbols().visit(iet)
buffers = [f for f in functions if f.is_Array and f._mem_mapped]
hostfuncs = [f for f in functions if not is_on_device(f, self.gpu_fit)]
return not (buffers and hostfuncs)

return not (hostfuncs and buffers)


class Sections(tuple):
Expand Down
10 changes: 8 additions & 2 deletions devito/passes/iet/parpragma.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,13 +295,13 @@ def _select_candidates(self, candidates):
except TypeError:
pass

collapsable.append(i)
collapsable.append(i)

# Give a score to this candidate, based on the number of fully-parallel
# Iterations and their position (i.e. outermost to innermost) in the nest
score = (
int(root.is_ParallelNoAtomic),
-int(self._is_offloadable(root))*(n0 + 1), # Outermost offloadable
self._n_device_pointers(root), # Outermost offloadable
int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1),
int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1),
-(n0 + 1) # The outermost, the better
Expand Down Expand Up @@ -375,6 +375,12 @@ def _make_partree(self, candidates, nthreads=None):
ncollapsed=ncollapsed, nthreads=nthreads,
**root.args)
prefix = []
elif all(i.is_ParallelRelaxed for i in candidates) and nthreads is not None:
body = self.HostIteration(schedule='static',
parallel=nthreads is not self.nthreads_nested,
ncollapsed=ncollapsed, nthreads=nthreads,
**root.args)
prefix = []
else:
# pragma ... for ... schedule(..., expr)
assert nthreads is None
Expand Down
10 changes: 6 additions & 4 deletions tests/test_dle.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
'openmp': True,
'par-collapse-ncores': 1}))

assert_structure(op, ['t', 't,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
't,p_s0_blk0,p_s,rsx,rsy')


Expand Down Expand Up @@ -821,12 +821,13 @@ def test_incs_no_atomic(self):
'par-collapse-ncores': 1,
'par-collapse-work': 0}))

assert 'collapse(2)' in str(op0)
assert 'collapse(3)' in str(op0)
assert 'atomic' in str(op0)

# Now only `x` is parallelized
op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)],
opt=('advanced', {'openmp': True, 'par-collapse-ncores': 1}))

assert 'omp for' in str(op1)
assert 'collapse' not in str(op1)
assert 'atomic' not in str(op1)
Expand Down Expand Up @@ -951,19 +952,20 @@ def test_parallel_prec_inject(self):
eqns = sf.inject(field=u.forward, expr=sf * dt**2)

op0 = Operator(eqns, opt=('advanced', {'openmp': True,
'par-collapse-ncores': 1}))
'par-collapse-ncores': 20}))
iterations = FindNodes(Iteration).visit(op0)

assert not iterations[0].pragmas
assert 'omp for' in iterations[1].pragmas[0].value
assert 'collapse' not in iterations[1].pragmas[0].value

op0 = Operator(eqns, opt=('advanced', {'openmp': True,
'par-collapse-ncores': 1,
'par-collapse-work': 1}))
iterations = FindNodes(Iteration).visit(op0)

assert not iterations[0].pragmas
assert 'omp for' in iterations[1].pragmas[0].value
assert 'omp for collapse' in iterations[2].pragmas[0].value


class TestNestedParallelism(object):
Expand Down
16 changes: 8 additions & 8 deletions tests/test_gpu_openacc.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,15 @@ def test_tile_insteadof_collapse(self, par_tile):
opt=('advanced', {'par-tile': par_tile}))

trees = retrieve_iteration_tree(op)
assert len(trees) == 4
assert len(trees) == 6

assert trees[0][1].pragmas[0].value ==\
'acc parallel loop tile(32,4,4) present(u)'
assert trees[1][1].pragmas[0].value ==\
'acc parallel loop tile(32,4,4) present(u)'
assert trees[2][1].pragmas[0].value ==\
'acc parallel loop tile(32,4) present(u)'
# Only the AFFINE Iterations are tiled
assert trees[3][1].pragmas[0].value ==\
'acc parallel loop collapse(3) present(src,src_coords,u)'
assert trees[4][1].pragmas[0].value ==\
'acc parallel loop present(src,src_coords,u) deviceptr(r1,r2,r3)'

@pytest.mark.parametrize('par_tile', [((32, 4, 4), (8, 8)), ((32, 4), (8, 8)),
((32, 4, 4), (8, 8, 8))])
Expand All @@ -130,11 +130,11 @@ def test_multiple_tile_sizes(self, par_tile):
opt=('advanced', {'par-tile': par_tile}))

trees = retrieve_iteration_tree(op)
assert len(trees) == 4
assert len(trees) == 6

assert trees[0][1].pragmas[0].value ==\
'acc parallel loop tile(32,4,4) present(u)'
assert trees[1][1].pragmas[0].value ==\
'acc parallel loop tile(32,4,4) present(u)'
assert trees[2][1].pragmas[0].value ==\
'acc parallel loop tile(8,8) present(u)'

def test_multi_tile_blocking_structure(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_gpu_openmp.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def test_timeparallel_reduction(self):
assert not tree.root.pragmas
assert len(tree[1].pragmas) == 1
assert tree[1].pragmas[0].value ==\
('omp target teams distribute parallel for collapse(2)'
('omp target teams distribute parallel for collapse(3)'
' reduction(+:f[0])')


Expand Down
3 changes: 2 additions & 1 deletion tests/test_mpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2558,7 +2558,8 @@ def test_adjoint_F_no_omp(self):
# TestDecomposition().test_reshape_left_right()
# TestOperatorSimple().test_trivial_eq_2d()
# TestFunction().test_halo_exchange_bilateral()
TestSparseFunction().test_sparse_coords()
# TestSparseFunction().test_sparse_coords()
# TestSparseFunction().test_precomputed_sparse(2)
# TestOperatorAdvanced().test_fission_due_to_antidep()
TestOperatorAdvanced().test_injection_wodup_wtime()
# TestIsotropicAcoustic().test_adjoint_F(1)

0 comments on commit f7ab007

Please sign in to comment.