diff --git a/devito/core/operator.py b/devito/core/operator.py index 2de585a11e..887c79978a 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -329,9 +329,9 @@ class OptOption(object): class ParTileArg(tuple): - def __new__(cls, items, shm=0, tag=None): + def __new__(cls, items, rule=None, tag=None): obj = super().__new__(cls, items) - obj.shm = shm + obj.rule = rule obj.tag = tag return obj @@ -371,14 +371,15 @@ def __new__(cls, items, default=None): try: y = items[1] - if is_integer(y): - # E.g., ((32, 4, 8), 1) - # E.g., ((32, 4, 8), 1, 'tag') + if is_integer(y) or isinstance(y, str) or y is None: + # E.g., ((32, 4, 8), 'rule') + # E.g., ((32, 4, 8), 'rule', 'tag') items = (ParTileArg(*items),) else: try: - # E.g., (((32, 4, 8), 1), ((32, 4, 4), 2)) - # E.g., (((32, 4, 8), 1, 'tag0'), ((32, 4, 4), 2, 'tag1')) + # E.g., (((32, 4, 8), 'rule'), ((32, 4, 4), 'rule')) + # E.g., (((32, 4, 8), 'rule0', 'tag0'), + # ((32, 4, 4), 'rule1', 'tag1')) items = tuple(ParTileArg(*i) for i in items) except TypeError: # E.g., ((32, 4, 8), (32, 4, 4)) diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py index 1ca7e29506..ed4bab957a 100644 --- a/devito/finite_differences/differentiable.py +++ b/devito/finite_differences/differentiable.py @@ -346,6 +346,10 @@ def __new__(cls, *args, **kwargs): return obj def subs(self, *args, **kwargs): + if len(args) == 2: + old, new = args + if self == old: + return new return self.func(*[getattr(a, 'subs', lambda x: a)(*args, **kwargs) for a in self.args], evaluate=False) @@ -556,6 +560,9 @@ def __repr__(self): __str__ = __repr__ + def _sympystr(self, printer): + return str(self) + def _hashable_content(self): return super()._hashable_content() + (self.dimensions,) @@ -621,7 +628,7 @@ def __eq__(self, other): __hash__ = sympy.Basic.__hash__ def _hashable_content(self): - return (self.name, self.dimension, hash(tuple(self.weights))) + return (self.name, self.dimension, str(self.weights)) @property def dimension(self): @@ -665,6 +672,10 @@ def __new__(cls, expr, mapper, **kwargs): def _hashable_content(self): return super()._hashable_content() + (self.mapper,) + @cached_property + def base(self): + return self.expr.func(*[a for a in self.expr.args if a is not self.weights]) + @property def weights(self): return self._weights diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py index e6e35f03fd..9eb4261b2b 100644 --- a/devito/finite_differences/finite_difference.py +++ b/devito/finite_differences/finite_difference.py @@ -207,9 +207,11 @@ def generic_derivative(expr, dim, fd_order, deriv_order, matvec=direct, x0=None, matvec, x0, symbolic, expand) -def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic, expand): +def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic, + expand): # The stencil indices - indices, x0 = generate_indices(expr, dim, fd_order, side=side, matvec=matvec, x0=x0) + indices, x0 = generate_indices(expr, dim, fd_order, side=side, matvec=matvec, + x0=x0) # Finite difference weights from Taylor approximation given these positions if symbolic: @@ -221,15 +223,24 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic weights = [sympify(w).evalf(_PRECISION) for w in weights] # Transpose the FD, if necessary - if matvec: - indices = indices.scale(matvec.val) + indices = indices.scale(matvec.val) # Shift index due to staggering, if any indices = indices.shift(-(expr.indices_ref[dim] - dim)) + # The user may wish to restrict expansion to selected derivatives + if callable(expand): + expand = expand(dim) + if not expand and indices.expr is not None: weights = Weights(name='w', dimensions=indices.free_dim, initvalue=weights) + if matvec == transpose: + # For homogenity, always generate e.g. `x + i0` rather than `x - i0` + # for transpose and `x + i0` for direct + indices = indices.transpose() + weights = weights._subs(indices.free_dim, -indices.free_dim) + # Inject the StencilDimension # E.g. `x + i*h_x` into `f(x)` s.t. `f(x + i*h_x)` expr = expr._subs(dim, indices.expr) diff --git a/devito/finite_differences/tools.py b/devito/finite_differences/tools.py index ba112a3116..5ec3629403 100644 --- a/devito/finite_differences/tools.py +++ b/devito/finite_differences/tools.py @@ -197,6 +197,24 @@ def scale(self, v): return IndexSet(self.dim, indices, expr=expr, fd=self.free_dim) + def transpose(self): + """ + Transpose the IndexSet. + """ + indices = tuple(reversed(self)) + + free_dim = StencilDimension(self.free_dim.name, + -self.free_dim._max, + -self.free_dim._min, + backward=True) + + try: + expr = self.expr._subs(self.free_dim, -free_dim) + except AttributeError: + expr = None + + return IndexSet(self.dim, indices, expr=expr, fd=free_dim) + def shift(self, v): """ Construct a new IndexSet with all indices shifted by `v`. diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py index 2c0bdacc0c..d9150573c8 100644 --- a/devito/ir/clusters/algorithms.py +++ b/devito/ir/clusters/algorithms.py @@ -6,8 +6,8 @@ import sympy from devito.exceptions import InvalidOperator -from devito.ir.support import (Any, Backward, Forward, IterationSpace, - PARALLEL_IF_ATOMIC, pull_dims) +from devito.ir.support import (Any, Backward, Forward, IterationSpace, erange, + pull_dims) from devito.ir.clusters.analysis import analyze from devito.ir.clusters.cluster import Cluster, ClusterGroup from devito.ir.clusters.visitors import Queue, QueueStateful, cluster_pass @@ -121,10 +121,12 @@ def callback(self, clusters, prefix, backlog=None, known_break=None): require_break = scope.d_flow.cause & maybe_break if require_break: backlog = [clusters[-1]] + backlog - # Try with increasingly smaller ClusterGroups until the ambiguity is gone + # Try with increasingly smaller ClusterGroups until the + # ambiguity is gone return self.callback(clusters[:-1], prefix, backlog, require_break) - # Schedule Clusters over different IterationSpaces if this increases parallelism + # Schedule Clusters over different IterationSpaces if this increases + # parallelism for i in range(1, len(clusters)): if self._break_for_parallelism(scope, candidates, i): return self.callback(clusters[:i], prefix, clusters[i:] + backlog, @@ -146,8 +148,8 @@ def callback(self, clusters, prefix, backlog=None, known_break=None): if not backlog: return processed - # Handle the backlog -- the Clusters characterized by flow- and anti-dependences - # along one or more Dimensions + # Handle the backlog -- the Clusters characterized by flow- and + # anti-dependences along one or more Dimensions idir = {d: Any for d in known_break} stamp = Stamp() for i, c in enumerate(list(backlog)): @@ -278,7 +280,11 @@ def callback(self, clusters, prefix): size = i.function.shape_allocated[d] assert is_integer(size) - mapper[size][si].add(iaf) + # Resolve StencilDimensions in case of unexpanded expressions + # E.g. `i0 + t` -> `(t - 1, t, t + 1)` + iafs = erange(iaf) + + mapper[size][si].update(iafs) # Construct the ModuloDimensions mds = [] @@ -288,7 +294,8 @@ def callback(self, clusters, prefix): # SymPy's index ordering (t, t-1, t+1) afer modulo replacement so # that associativity errors are consistent. This corresponds to # sorting offsets {-1, 0, 1} as {0, -1, 1} assigning -inf to 0 - siafs = sorted(iafs, key=lambda i: -np.inf if i - si == 0 else (i - si)) + key = lambda i: -np.inf if i - si == 0 else (i - si) + siafs = sorted(iafs, key=key) for iaf in siafs: name = '%s%d' % (si.name, len(mds)) @@ -451,7 +458,8 @@ def normalize_reductions(cluster, sregistry, options): """ opt_mapify_reduce = options['mapify-reduce'] - dims = [d for d, v in cluster.properties.items() if PARALLEL_IF_ATOMIC in v] + dims = [d for d in cluster.ispace.itdims + if cluster.properties.is_parallel_atomic(d)] if not dims: return cluster diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index 0dc3200b4f..636c224e59 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -8,7 +8,7 @@ Forward, Interval, IntervalGroup, IterationSpace, DataSpace, Guards, Properties, Scope, detect_accesses, detect_io, normalize_properties, normalize_syncs, - sdims_min, sdims_max) + minimum, maximum) from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.symbolics import estimate_cost from devito.tools import as_tuple, flatten, frozendict, infer_dtype @@ -52,13 +52,7 @@ def __init__(self, exprs, ispace=None, guards=None, properties=None, syncs=None, # Normalize properties properties = Properties(properties or {}) - for d in ispace.itdimensions: - properties = properties.add(d) - for i in properties: - for d in as_tuple(i): - if d not in ispace.itdimensions: - properties = properties.drop(d) - self._properties = properties + self._properties = tailor_properties(properties, ispace) self._halo_scheme = halo_scheme @@ -85,10 +79,7 @@ def from_clusters(cls, *clusters): guards = root.guards - properties = {} - for c in clusters: - for d, v in c.properties.items(): - properties[d] = normalize_properties(properties.get(d, v), v) + properties = reduce_properties(clusters) try: syncs = normalize_syncs(*[c.syncs for c in clusters]) @@ -213,12 +204,10 @@ def is_dense(self): # at most PARALLEL_IF_PVT). This is a quick and easy check so we try it first try: pset = {PARALLEL, PARALLEL_IF_PVT} - grid = self.grid - for d in grid.dimensions: - if not any(pset & v for k, v in self.properties.items() - if d in k._defines): - raise ValueError - return True + target = set(self.grid.dimensions) + dims = {d for d in self.properties if d._defines & target} + if any(pset & self.properties[d] for d in dims): + return True except ValueError: pass @@ -276,8 +265,8 @@ def dspace(self): continue intervals = [Interval(d, - min([sdims_min(i) for i in offs]), - max([sdims_max(i) for i in offs])) + min([minimum(i) for i in offs]), + max([maximum(i) for i in offs])) for d, offs in v.items()] intervals = IntervalGroup(intervals) @@ -418,6 +407,10 @@ def scope(self): def ispace(self): return self._ispace + @cached_property + def properties(self): + return tailor_properties(reduce_properties(self), self.ispace) + @cached_property def guards(self): """The guards of each Cluster in self.""" @@ -425,8 +418,10 @@ def guards(self): @cached_property def syncs(self): - """The synchronization operations of each Cluster in self.""" - return tuple(i.syncs for i in self) + """ + A view of the ClusterGroup's synchronization operations. + """ + return normalize_syncs(*[c.syncs for c in self]) @cached_property def dspace(self): @@ -461,3 +456,26 @@ def meta(self): The data type and the data space of the ClusterGroup. """ return (self.dtype, self.dspace) + + +# *** Utils + +def reduce_properties(clusters): + properties = {} + for c in clusters: + for d, v in c.properties.items(): + properties[d] = normalize_properties(properties.get(d, v), v) + + return Properties(properties) + + +def tailor_properties(properties, ispace): + for d in ispace.itdimensions: + properties = properties.add(d) + + for i in properties: + for d in as_tuple(i): + if d not in ispace.itdimensions: + properties = properties.drop(d) + + return properties diff --git a/devito/ir/support/__init__.py b/devito/ir/support/__init__.py index 2b20e3ab20..8a8b50bc67 100644 --- a/devito/ir/support/__init__.py +++ b/devito/ir/support/__init__.py @@ -1,5 +1,5 @@ -from .utils import * # noqa from .vector import * # noqa +from .utils import * # noqa from .basic import * # noqa from .space import * # noqa from .guards import * # noqa diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 36ea735109..b73b49befe 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -4,15 +4,16 @@ from sympy import S from devito.ir.support.space import Backward, IterationSpace -from devito.ir.support.utils import AccessMode +from devito.ir.support.utils import AccessMode, extrema from devito.ir.support.vector import LabeledVector, Vector -from devito.symbolics import (retrieve_terminals, q_constant, q_affine, q_routine, - q_terminal) +from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals, + q_constant, q_affine, q_routine, search, uxreplace) from devito.tools import (Tag, as_tuple, is_integer, filter_sorted, flatten, memoized_meth, memoized_generator) -from devito.types import Barrier, Dimension, DimensionTuple, Jump, Symbol +from devito.types import (Barrier, ComponentAccess, Dimension, DimensionTuple, + Jump, Symbol) -__all__ = ['IterationInstance', 'TimedAccess', 'Scope'] +__all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry'] class IndexMode(Tag): @@ -236,7 +237,7 @@ def __eq__(self, other): # which might require expensive comparisons of Vector entries (i.e., # SymPy expressions) - return (self.access is other.access and # => self.function is other.function + return (self.access is other.access and self.mode == other.mode and self.ispace == other.ispace) @@ -322,6 +323,12 @@ def distance(self, other): other : TimedAccess The TimedAccess w.r.t. which the distance is computed. """ + if isinstance(self.access, ComponentAccess) and \ + isinstance(other.access, ComponentAccess) and \ + self.access.index != other.access.index: + # E.g., `uv(x).x` and `uv(x).y` -- not a real dependence! + return Vector(S.ImaginaryUnit) + ret = [] for sit, oit in zip(self.itintervals, other.itintervals): n = len(ret) @@ -810,9 +817,9 @@ def __init__(self, exprs, rules=None): for i, e in enumerate(exprs): # Reads - terminals = retrieve_terminals(e.rhs, deep=True, mode='unique') + terminals = retrieve_accesses(e.rhs, deep=True) try: - terminals.update(retrieve_terminals(e.lhs.indices)) + terminals.update(retrieve_accesses(e.lhs.indices)) except AttributeError: pass for j in terminals: @@ -821,12 +828,10 @@ def __init__(self, exprs, rules=None): v.append(TimedAccess(j, mode, i, e.ispace)) # Write - terminals = [] - if q_terminal(e.lhs): - terminals.append(e.lhs) + terminals = retrieve_accesses(e.lhs) if q_routine(e.rhs): try: - terminals.extend(e.rhs.writes) + terminals.update(e.rhs.writes) except AttributeError: # E.g., foreign routines, such as `cos` or `sin` pass @@ -846,7 +851,7 @@ def __init__(self, exprs, rules=None): # Look up ConditionalDimensions for v in e.conditionals.values(): - for j in retrieve_terminals(v): + for j in retrieve_accesses(v): v = self.reads.setdefault(j.function, []) v.append(TimedAccess(j, 'R', -1, e.ispace)) @@ -860,7 +865,7 @@ def __init__(self, exprs, rules=None): # Objects altering the control flow (e.g., synchronization barriers, # break statements, ...) are converted into mock dependences for i, e in enumerate(exprs): - if e.find(Barrier) | e.find(Jump): + if isinstance(e.rhs, (Barrier, Jump)): self.writes.setdefault(mocksym, []).append( TimedAccess(mocksym, 'W', i, e.ispace) ) @@ -883,7 +888,8 @@ def __getitem__(self, function): return self.getwrites(function) + self.getreads(function) def __repr__(self): - tracked = filter_sorted(set(self.reads) | set(self.writes), key=lambda i: i.name) + tracked = filter_sorted(set(self.reads) | set(self.writes), + key=lambda i: i.name) maxlen = max(1, max([len(i.name) for i in tracked])) out = "{:>%d} => W : {}\n{:>%d} R : {}" % (maxlen, maxlen) pad = " "*(maxlen + 9) @@ -908,6 +914,19 @@ def __repr__(self): return "\n".join([out.format(i.name, w, '', r) for i, r, w in zip(tracked, reads, writes)]) + @cached_property + def reads_extremaed(self): + """ + A view of the Scope's reads in which StencilDimensions are replaced + with their extrema. + """ + ret = {f: [] for f in self.reads} + for f, v in self.reads.items(): + for i in v: + for j in set(extrema(i.access)): + ret[f].append(TimedAccess(j, i.mode, i.timestamp, i.ispace)) + return ret + @cached_property def accesses(self): groups = list(self.reads.values()) + list(self.writes.values()) @@ -933,9 +952,12 @@ def d_flow_gen(self): """Generate the flow (or "read-after-write") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.reads.get(k, []): + for r in self.reads_extremaed.get(k, []): dependence = Dependence(w, r) + if dependence.is_imaginary: + continue + if any(not rule(dependence) for rule in self.rules): continue @@ -946,8 +968,7 @@ def d_flow_gen(self): # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence, unless # it's a read-for-reduction - is_flow = (not dependence.is_imaginary and - not r.is_read_reduction) + is_flow = not r.is_read_reduction if is_flow: yield dependence @@ -961,9 +982,12 @@ def d_anti_gen(self): """Generate the anti (or "write-after-read") dependences.""" for k, v in self.writes.items(): for w in v: - for r in self.reads.get(k, []): + for r in self.reads_extremaed.get(k, []): dependence = Dependence(r, w) + if dependence.is_imaginary: + continue + if any(not rule(dependence) for rule in self.rules): continue @@ -974,8 +998,7 @@ def d_anti_gen(self): # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence, unless # it's a read-for-reduction - is_anti = (not dependence.is_imaginary and - not r.is_read_reduction) + is_anti = not r.is_read_reduction if is_anti: yield dependence @@ -992,6 +1015,9 @@ def d_output_gen(self): for w2 in self.writes.get(k, []): dependence = Dependence(w2, w1) + if dependence.is_imaginary: + continue + if any(not rule(dependence) for rule in self.rules): continue @@ -1001,7 +1027,7 @@ def d_output_gen(self): except TypeError: # Non-integer vectors are not comparable. # Conservatively, we assume it is a dependence - is_output = not dependence.is_imaginary + is_output = True if is_output: yield dependence @@ -1028,7 +1054,7 @@ def d_from_access_gen(self, accesses): accesses = as_tuple(accesses) for d in self.d_all_gen(): for i in accesses: - if d.source is i or d.sink is i: + if d.source == i or d.sink == i: yield d break @@ -1064,3 +1090,147 @@ def r_all(self): All Relations of the Scope. """ return list(self.r_gen()) + + +class ExprGeometry(object): + + """ + Geometric representation of an expression by abstracting Indexeds as + LabeledVectors. + """ + + def __init__(self, expr, indexeds=None, bases=None, offsets=None): + self.expr = expr + + if indexeds is not None: + self.indexeds = indexeds + self.bases = bases + self.offsets = offsets + return + + self.indexeds = retrieve_indexed(expr) + + bases = [] + offsets = [] + for ii in self.iinstances: + base = [] + offset = [] + for e, fi, ai in zip(ii, ii.findices, ii.aindices): + if ai is None: + base.append((fi, e)) + else: + base.append((fi, ai)) + offset.append((ai, e - ai)) + bases.append(LabeledVector(base)) + offsets.append(LabeledVector(offset)) + + self.bases = bases + self.offsets = offsets + + def __repr__(self): + return "ExprGeometry(expr=%s)" % self.expr + + def translated(self, other, dims=None): + """ + True if `self` is translated w.r.t. `other`, False otherwise. + + Examples + -------- + Two expressions are translated if they perform the same operations, + their bases are the same and their offsets are pairwise translated. + + c := A[i,j] op A[i,j+1] -> Toffsets = {i: [0,0], j: [0,1]} + u := A[i+1,j] op A[i+1,j+1] -> Toffsets = {i: [1,1], j: [0,1]} + + Then `c` is translated w.r.t. `u` with distance `{i: 1, j: 0}` + + The test may be strengthen by imposing that a translation occurs + only along a specific set of Dimensions through the kwarg `dims`. + """ + # Check mathematical structure + if not compare_ops(self.expr, other.expr): + return False + + # Use a suitable value for `dims` if not provided by user + if dims is None: + if self.aindices != other.aindices: + return False + dims = self.aindices + dims = set(as_tuple(dims)) + + # Check bases and offsets + for i in ['Tbases', 'Toffsets']: + Ti0 = getattr(self, i) + Ti1 = getattr(other, i) + + m0 = dict(Ti0) + m1 = dict(Ti1) + + # The only hope in presence of Dimensions appearing only in either + # `self` or `other` is that they have been projected away by the caller + for d in set(m0).symmetric_difference(set(m1)): + if not d._defines & dims: + return False + + for d in set(m0).union(set(m1)): + try: + o0 = m0[d] + o1 = m1[d] + except KeyError: + continue + + distance = set(o0 - o1) + if len(distance) != 1: + return False + + if not d._defines & dims: + if distance.pop() != 0: + return False + + return True + + @cached_property + def iinstances(self): + return tuple(IterationInstance(i) for i in self.indexeds) + + @cached_property + def Tbases(self): + return LabeledVector.transpose(*self.bases) + + @cached_property + def Toffsets(self): + return LabeledVector.transpose(*self.offsets) + + @cached_property + def dimensions(self): + return frozenset(i for i, _ in self.Toffsets) + + @cached_property + def aindices(self): + try: + return tuple(zip(*self.Toffsets))[0] + except IndexError: + return () + + @property + def is_regular(self): + return all(i.is_regular for i in self.iinstances) + + +# *** Utils + +def retrieve_accesses(exprs, **kwargs): + """ + Like retrieve_terminals, but ensure that if a ComponentAccess is found, + the ComponentAccess itself is returned, while the wrapped Indexed is discarded. + """ + kwargs['mode'] = 'unique' + + compaccs = search(exprs, ComponentAccess) + if not compaccs: + return retrieve_terminals(exprs, **kwargs) + + subs = {i: Symbol('dummy%d' % n) for n, i in enumerate(compaccs)} + exprs1 = uxreplace(exprs, subs) + + return compaccs | retrieve_terminals(exprs1, **kwargs) - set(subs.values()) diff --git a/devito/ir/support/guards.py b/devito/ir/support/guards.py index dd1bcc01a5..a378929fbc 100644 --- a/devito/ir/support/guards.py +++ b/devito/ir/support/guards.py @@ -8,7 +8,7 @@ from devito.ir.support.space import Forward, IterationDirection from devito.symbolics import CondEq, CondNe -from devito.tools import Pickable, as_tuple, frozendict +from devito.tools import Pickable, as_tuple, frozendict, split from devito.types import Dimension __all__ = ['GuardFactor', 'GuardBound', 'GuardBoundNext', 'BaseGuardBound', @@ -228,7 +228,7 @@ def andg(self, d, guard): return Guards(m) try: - m[d] = And(m[d], guard) + m[d] = and_smart(m[d], guard) except KeyError: m[d] = guard @@ -269,3 +269,40 @@ def filter(self, key): m = {d: v for d, v in self.items() if key(d)} return Guards(m) + + +# *** Utils + +def and_smart(relation, v): + """ + Given `x = And(*relation.args, v)`, return `relation` if `x ≡ relation`, + `x` otherwise. + + SymPy doesn't have a builtin function to simplify boolean inequalities; here, + we run a set of simple checks to at least discard the most obvious (and thus + annoying to see in the generated code) redundancies. + """ + if isinstance(relation, And): + candidates, other = split(list(relation.args), lambda a: type(a) is type(v)) + elif type(relation) is type(v): + candidates, other = [relation], [] + else: + candidates, other = [], [relation, v] + + covered = False + new_args = [] + for a in candidates: + if a.lhs is v.lhs: + covered = True + if type(a) in (Gt, Ge) and v.rhs > a.rhs: + new_args.append(v) + elif type(a) in (Lt, Le) and v.rhs < a.rhs: + new_args.append(v) + else: + new_args.append(a) + else: + new_args.append(a) + if not covered: + new_args.append(v) + + return And(*(new_args + other)) diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index f4ab873575..dfc7a7d4c2 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -208,6 +208,9 @@ def is_parallel(self, dims): def is_parallel_relaxed(self, dims): return any(len(self[d] & PARALLELS) > 0 for d in as_tuple(dims)) + def is_parallel_atomic(self, dims): + return any(PARALLEL_IF_ATOMIC in self.get(d, ()) for d in as_tuple(dims)) + def is_affine(self, dims): return any(AFFINE in self.get(d, ()) for d in as_tuple(dims)) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index b85b017b4a..529258bada 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -6,7 +6,7 @@ from cached_property import cached_property from sympy import Expr -from devito.ir.support.utils import sdims_min, sdims_max +from devito.ir.support.utils import minimum, maximum from devito.ir.support.vector import Vector, vmin, vmax from devito.tools import (PartialOrderTuple, Stamp, as_list, as_tuple, filter_ordered, flatten, frozendict, is_integer, toposort) @@ -286,8 +286,9 @@ def promote(self, cond): return self def expand(self): - return Interval(self.dim, sdims_min(self.lower), sdims_max(self.upper), - self.stamp) + return Interval( + self.dim, minimum(self.lower), maximum(self.upper), self.stamp + ) class IntervalGroup(PartialOrderTuple): @@ -904,6 +905,18 @@ def promote(self, cond): return IterationSpace(intervals, sub_iterators, directions) + def prefix(self, key): + """ + Return the IterationSpace up to and including the last Interval + such that `key(interval)` is True. + """ + try: + i = self.project(key)[-1] + except IndexError: + return None + + return self[:self.index(i.dim) + 1] + def is_compatible(self, other): """ A relaxed version of ``__eq__``, in which only non-derived dimensions @@ -916,6 +929,10 @@ def is_compatible(self, other): def itdimensions(self): return self.intervals.dimensions + @property + def itdims(self): + return self.itdimensions + @property def relations(self): return self.intervals.relations diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index 5f08f48020..2b50073d2d 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -1,12 +1,16 @@ -from collections import defaultdict +from collections import defaultdict, namedtuple +from itertools import product -from devito.symbolics import CallFromPointer, retrieve_indexed, retrieve_terminals +from devito.finite_differences import IndexDerivative +from devito.symbolics import (CallFromPointer, retrieve_indexed, retrieve_terminals, + search) from devito.tools import DefaultOrderedDict, as_tuple, flatten, filter_sorted, split from devito.types import (Dimension, DimensionTuple, Indirection, ModuloDimension, StencilDimension) __all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io', - 'pull_dims', 'sdims_min', 'sdims_max'] + 'pull_dims', 'unbounded', 'minimum', 'maximum', 'minmax_index', + 'extrema', 'erange'] class AccessMode(object): @@ -271,25 +275,91 @@ def pull_dims(exprs, flag=True): # *** Utility functions for expressions that potentially contain StencilDimensions -def sdims_min(expr): +def unbounded(expr): """ - Replace all StencilDimensions in `expr` with their minimum point. + Retrieve all unbounded Dimensions in `expr`. """ - try: - sdims = expr.find(StencilDimension) - except AttributeError: - return expr - mapper = {e: e._min for e in sdims} - return expr.subs(mapper) + # At the moment we only have logic to retrieve unbounded StencilDimensions, + # but in the future this might change + bound = set().union(*[i.dimensions for i in search(expr, IndexDerivative)]) + sdims = search(expr, StencilDimension, mode='unique', deep=True) + + return sdims - bound + + +Extrema = namedtuple('Extrema', 'm M') -def sdims_max(expr): +def _minmax(expr, callback, udims=None): """ - Replace all StencilDimensions in `expr` with their maximum point. + Helper for `minimum`, `maximum`, and potential future utilities that share + a significant chunk of logic. """ - try: - sdims = expr.find(StencilDimension) - except AttributeError: + if not udims: + udims = unbounded(expr) + + # Resolution rule 1: StencilDimensions + sdims = [d for d in udims if d.is_Stencil] + if not sdims: return expr - mapper = {e: e._max for e in sdims} + mapper = {e: callback(e) for e in sdims} + return expr.subs(mapper) + + +def minimum(expr, udims=None): + """ + Substitute the unbounded Dimensions in `expr` with their minimum point. + + Unbounded Dimensions whose possible minimum value is not known are ignored. + """ + return _minmax(expr, lambda e: e._min, udims) + + +def maximum(expr, udims=None): + """ + Substitute the unbounded Dimensions in `expr` with their maximum point. + + Unbounded Dimensions whose possible maximum value is not known are ignored. + """ + return _minmax(expr, lambda e: e._max, udims) + + +def extrema(expr): + """ + The minimum and maximum extrema assumed by `expr` once the unbounded + Dimensions are resolved. + """ + return Extrema(minimum(expr), maximum(expr)) + + +def minmax_index(expr, d): + """ + Return the minimum and maximum indices along the `d` Dimension + among all Indexeds in `expr`. + """ + indices = set() + for i in retrieve_indexed(expr): + try: + indices.add(i.indices[d]) + except KeyError: + pass + + return Extrema(min(minimum(i) for i in indices), + max(maximum(i) for i in indices)) + + +def erange(expr): + """ + All possible values that `expr` can assume once its unbounded Dimensions + are resolved. + """ + udims = unbounded(expr) + if not udims: + return (expr,) + + sdims = [d for d in udims if d.is_Stencil] + ranges = [i.range for i in sdims] + mappers = [dict(zip(sdims, i)) for i in product(*ranges)] + + return tuple(expr.subs(m) for m in mappers) diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 26202a419b..7b31ba65db 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -311,8 +311,15 @@ def _lower_exprs(cls, expressions, **kwargs): # Specialization is performed on unevaluated expressions expressions = cls._specialize_dsl(expressions, **kwargs) - # Lower functional DSL + # Lower FD derivatives + # NOTE: we force expansion of derivatives along SteppingDimensions + # because it drastically simplifies the subsequent lowering into + # ModuloDimensions + if not expand: + expand = lambda d: d.is_Stepping expressions = flatten([i._evaluate(expand=expand) for i in expressions]) + + # Scalarize the tensor equations, if any expressions = [j for i in expressions for j in i._flatten] # A second round of specialization is performed on evaluated expressions @@ -922,15 +929,22 @@ def _emit_apply_profiling(self, args): # Emit local, i.e. "per-rank" performance. Without MPI, this is the only # thing that will be emitted - for k, v in summary.items(): - rank = "[rank%d]" % k.rank if k.rank is not None else "" + def lower_perfentry(v): if v.gflopss: oi = "OI=%.2f" % fround(v.oi) gflopss = "%.2f GFlops/s" % fround(v.gflopss) gpointss = "%.2f GPts/s" % fround(v.gpointss) - metrics = "[%s]" % ", ".join([oi, gflopss, gpointss]) + return "[%s]" % ", ".join([oi, gflopss, gpointss]) + elif v.gpointss: + gpointss = "%.2f GPts/s" % fround(v.gpointss) + return "[%s]" % gpointss else: - metrics = "" + return "" + + for k, v in summary.items(): + rank = "[rank%d]" % k.rank if k.rank is not None else "" + + metrics = lower_perfentry(v) itershapes = [",".join(str(i) for i in its) for its in v.itershapes] if len(itershapes) > 1: @@ -942,9 +956,12 @@ def _emit_apply_profiling(self, args): name = "%s%s<%s>" % (k.name, rank, itershapes) perf("%s* %s ran in %.2f s %s" % (indent, name, fround(v.time), metrics)) - for n, time in summary.subsections.get(k.name, {}).items(): - perf("%s+ %s ran in %.2f s [%.2f%%]" % - (indent*2, n, time, fround(time/v.time*100))) + for n, v1 in summary.subsections.get(k.name, {}).items(): + metrics = lower_perfentry(v1) + + perf("%s+ %s ran in %.2f s [%.2f%%] %s" % + (indent*2, n, fround(v1.time), fround(v1.time/v.time*100), + metrics)) # Emit performance mode and arguments perf_args = {} diff --git a/devito/operator/profiling.py b/devito/operator/profiling.py index 2a58677481..53f16e526f 100644 --- a/devito/operator/profiling.py +++ b/devito/operator/profiling.py @@ -105,6 +105,25 @@ def track_subsection(self, sname, name): v = self._subsections.setdefault(sname, OrderedDict()) v[name] = SectionData(S.Zero, S.Zero, S.Zero, S.Zero, []) + def group_as_subsections(self, sname, sections): + ops = sum(self._sections[i].ops for i in sections) + points = sum(self._sections[i].points for i in sections) + traffic = sum(self._sections[i].traffic for i in sections) + sectiondata = SectionData(ops, S.Zero, points, traffic, []) + + v = self._subsections.setdefault(sname, OrderedDict()) + v.update({i: self._sections[i] for i in sections}) + + new_sections = OrderedDict() + for k, v in self._sections.items(): + try: + if sections.index(k) == len(sections) - 1: + new_sections[sname] = sectiondata + except ValueError: + new_sections[k] = v + self._sections.clear() + self._sections.update(new_sections) + def instrument(self, iet, timer): """ Instrument the given IET for C-level performance profiling. @@ -211,6 +230,58 @@ class AdvancedProfiler(Profiler): _supports_async_sections = True + def _evaluate_section(self, name, data, args, dtype): + # Time to run the section + time = max(getattr(args[self.name]._obj, name), 10e-7) + + # Number of FLOPs performed + try: + ops = int(subs_op_args(data.ops, args)) + except (AttributeError, TypeError): + # E.g., a section comprising just function calls, or at least + # a sequence of unrecognized or non-conventional expr statements + ops = np.nan + + try: + # Number of grid points computed + points = int(subs_op_args(data.points, args)) + + # Compulsory traffic + traffic = float(subs_op_args(data.traffic, args)*dtype().itemsize) + except (AttributeError, TypeError): + # E.g., the section has a dynamic loop size + points = np.nan + + traffic = np.nan + + # Nmber of FLOPs performed at each iteration + sops = data.sops + + # Runtime itermaps/itershapes + try: + itermaps = [OrderedDict([(k, int(subs_op_args(v, args))) + for k, v in i.items()]) + for i in data.itermaps] + itershapes = tuple(tuple(i.values()) for i in itermaps) + except TypeError: + # E.g., a section comprising just function calls, or at least + # a sequence of unrecognized or non-conventional expr statements + itershapes = () + + return time, ops, points, traffic, sops, itershapes + + def _allgather_from_comm(self, comm, time, ops, points, traffic, sops, itershapes): + times = comm.allgather(time) + assert comm.size == len(times) + + opss = comm.allgather(ops) + pointss = comm.allgather(points) + traffics = comm.allgather(traffic) + sops = [sops]*comm.size + itershapess = comm.allgather(itershapes) + + return list(zip(times, opss, pointss, traffics, sops, itershapess)) + # Override basic summary so that arguments other than runtime are computed. def summary(self, args, dtype, reduce_over=None): grid = args.grid @@ -219,71 +290,30 @@ def summary(self, args, dtype, reduce_over=None): # Produce sections summary summary = PerformanceSummary() for name, data in self._sections.items(): - # Time to run the section - time = max(getattr(args[self.name]._obj, name), 10e-7) - - # Number of FLOPs performed - try: - ops = int(subs_op_args(data.ops, args)) - except (AttributeError, TypeError): - # E.g., a section comprising just function calls, or at least - # a sequence of unrecognized or non-conventional expr statements - ops = np.nan - - try: - # Number of grid points computed - points = int(subs_op_args(data.points, args)) - - # Compulsory traffic - traffic = float(subs_op_args(data.traffic, args)*dtype().itemsize) - except (AttributeError, TypeError): - # E.g., the section has a dynamic loop size - points = np.nan - - traffic = np.nan - - # Runtime itermaps/itershapes - try: - itermaps = [OrderedDict([(k, int(subs_op_args(v, args))) - for k, v in i.items()]) - for i in data.itermaps] - itershapes = tuple(tuple(i.values()) for i in itermaps) - except TypeError: - # E.g., a section comprising just function calls, or at least - # a sequence of unrecognized or non-conventional expr statements - itershapes = () + items = self._evaluate_section(name, data, args, dtype) # Add local performance data if comm is not MPI.COMM_NULL: # With MPI enabled, we add one entry per section per rank - times = comm.allgather(time) - assert comm.size == len(times) - opss = comm.allgather(ops) - pointss = comm.allgather(points) - traffics = comm.allgather(traffic) - sops = [data.sops]*comm.size - itershapess = comm.allgather(itershapes) - items = list(zip(times, opss, pointss, traffics, sops, itershapess)) + items = self._allgather_from_comm(comm, *items) for rank in range(comm.size): summary.add(name, rank, *items[rank]) else: - summary.add(name, None, time, ops, points, traffic, data.sops, itershapes) + summary.add(name, None, *items) # Enrich summary with subsections data for sname, v in self._subsections.items(): for name, data in v.items(): - # Time to run the section - time = max(getattr(args[self.name]._obj, name), 10e-7) + items = self._evaluate_section(name, data, args, dtype) # Add local performance data if comm is not MPI.COMM_NULL: # With MPI enabled, we add one entry per section per rank - times = comm.allgather(time) - assert comm.size == len(times) + items = self._allgather_from_comm(comm, *items) for rank in range(comm.size): - summary.add_subsection(sname, name, rank, time) + summary.add_subsection(sname, name, rank, *items[rank]) else: - summary.add_subsection(sname, name, None, time) + summary.add_subsection(sname, name, None, *items) # Add global performance data if reduce_over is not None: @@ -422,11 +452,11 @@ def add(self, name, rank, time, self.input[k] = PerfInput(time, ops, points, traffic, sops, itershapes) - def add_subsection(self, sname, name, rank, time): + def add_subsection(self, sname, name, rank, time, *args): k0 = PerfKey(sname, rank) assert k0 in self - self.subsections[sname][name] = time + self.subsections[sname][name] = PerfEntry(time, None, None, None, None, []) def add_glb_vanilla(self, key, time): """ diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 1e8626da18..a7a1c57422 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -6,18 +6,19 @@ import numpy as np import sympy -from devito.finite_differences import EvalDerivative +from devito.finite_differences import EvalDerivative, IndexDerivative, Weights from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward, - IterationInstance, IterationSpace, Interval, Cluster, - Queue, IntervalGroup, LabeledVector, normalize_properties, - relax_properties, sdims_min, sdims_max) -from devito.symbolics import (Uxmapper, compare_ops, estimate_cost, q_constant, - reuse_if_untouched, retrieve_indexed, search, uxreplace) -from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, generator, - split, timed_pass) -from devito.types import (Array, TempFunction, Eq, Symbol, Temp, ModuloDimension, - CustomDimension, IncrDimension, StencilDimension, Indexed, - Hyperplane) + IterationSpace, Interval, Cluster, ExprGeometry, Queue, + IntervalGroup, LabeledVector, Vector, normalize_properties, + relax_properties, unbounded, minimum, maximum, extrema, + vmax, vmin) +from devito.symbolics import (Uxmapper, estimate_cost, search, reuse_if_untouched, + uxreplace) +from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, + is_integer, generator, split, timed_pass) +from devito.types import (Eq, Symbol, Temp, TempArray, TempFunction, + ModuloDimension, CustomDimension, IncrDimension, + StencilDimension, Indexed, Hyperplane) from devito.types.grid import MultiSubDimension __all__ = ['cire'] @@ -82,10 +83,13 @@ def cire(clusters, mode, sregistry, options, platform): t0 = 2.0*t2[x,y,z] t1 = 3.0*t2[x,y,z+1] """ - modes = { - 'invariants': [CireInvariantsElementary, CireInvariantsDivs], - 'sops': [CireSops], - } + # NOTE: Handle prematurely expanded derivatives -- current default on + # several backends, but soon to become legacy + if mode == 'sops': + if options['expand']: + mode = 'eval-derivs' + else: + mode = 'index-derivs' for cls in modes[mode]: transformer = cls(sregistry, options, platform) @@ -120,19 +124,17 @@ def _aliases_from_clusters(self, clusters, exclude, meta): # Clusters -> AliasList found = collect(mapper.extracted, meta.ispace, self.opt_minstorage) pexprs, aliases = choose(found, exprs, mapper, self.opt_mingain) - if not aliases: - continue # AliasList -> Schedule schedule = lower_aliases(aliases, meta, self.opt_maxpar) variants.append(Variant(schedule, pexprs)) - # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off) if not variants: return [] - schedule, exprs = pick_best(variants, self.opt_schedule_strategy, - self._eval_variants_delta) + + # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off) + schedule, exprs = self._select(variants) # Schedule -> Schedule (optimization) if self.opt_rotate: @@ -140,7 +142,8 @@ def _aliases_from_clusters(self, clusters, exclude, meta): schedule = optimize_schedule_padding(schedule, meta, self.platform) # Schedule -> [Clusters]_k - processed, subs = lower_schedule(schedule, meta, self.sregistry, self.opt_ftemps) + processed, subs = lower_schedule(schedule, meta, self.sregistry, + self.opt_ftemps) # [Clusters]_k -> [Clusters]_k (optimization) if self.opt_multisubdomain: @@ -162,6 +165,46 @@ def _aliases_from_clusters(self, clusters, exclude, meta): return processed + def process(self, clusters): + raise NotImplementedError + + def _generate(self, exprs, exclude): + """ + Generate one or more extractions from ``exprs``. An extraction is a + set of CIRE candidates which may be turned into aliases. Two different + extractions may contain overlapping sub-expressions and, therefore, + should be processed and evaluated indipendently. An extraction won't + contain any of the symbols appearing in ``exclude``. + """ + raise NotImplementedError + + def _lookup_key(self, c): + """ + Create a key for the given Cluster. Clusters with same key may be + processed together in the search for CIRE candidates. Clusters should + have a different key if they must not be processed together, e.g., + when this would lead to violation of data dependencies. + """ + raise NotImplementedError + + def _select(self, variants): + """ + Select the best variant out of a set of variants, weighing flops + and working set. + """ + raise NotImplementedError + + def _eval_variants_delta(self, delta_flops, delta_ws): + """ + Given the deltas in flop reduction and working set size increase of two + Variants, return True if the second variant is estimated to deliever + better performance, False otherwise. + """ + raise NotImplementedError + + +class CireTransformerLegacy(CireTransformer): + def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None): """ Carry out the bulk of the work of ``_generate``. @@ -192,38 +235,8 @@ def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None): return mapper - def _generate(self, exprs, exclude): - """ - Generate one or more extractions from ``exprs``. An extraction is a - set of CIRE candidates which may be turned into aliases. Two different - extractions may contain overlapping sub-expressions and, therefore, - should be processed and evaluated indipendently. An extraction won't - contain any of the symbols appearing in ``exclude``. - """ - raise NotImplementedError - - def _lookup_key(self, c): - """ - Create a key for the given Cluster. Clusters with same key may be - processed together in the search for CIRE candidates. Clusters should - have a different key if they must not be processed together, e.g., - when this would lead to violation of data dependencies. - """ - raise NotImplementedError - - def _eval_variants_delta(self, delta_flops, delta_ws): - """ - Given the deltas in flop reduction and working set size increase of two - Variants, return True if the second variant is estimated to deliever - better performance, False otherwise. - """ - raise NotImplementedError - - def process(self, clusters): - raise NotImplementedError - -class CireInvariants(CireTransformer, Queue): +class CireInvariants(CireTransformerLegacy, Queue): def __init__(self, sregistry, options, platform): super().__init__(sregistry, options, platform) @@ -271,6 +284,9 @@ def _lookup_key(self, c, d): return AliasKey(ispace, intervals, c.dtype, c.guards, properties) + def _select(self, variants): + return pick_best(variants, self._eval_variants_delta) + def _eval_variants_delta(self, delta_flops, delta_ws): # Always prefer the Variant with fewer temporaries return ((delta_ws == 0 and delta_flops < 0) or @@ -309,7 +325,7 @@ def _generate(self, exprs, exclude): yield self._do_generate(exprs, exclude, cbk_search) -class CireSops(CireTransformer): +class CireDerivatives(CireTransformerLegacy): def __init__(self, sregistry, options, platform): super().__init__(sregistry, options, platform) @@ -340,55 +356,117 @@ def process(self, clusters): return processed def _generate(self, exprs, exclude): - # E.g., extract `u.dx*a*b` and `u.dx*a*c` from `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` - def cbk_search(expr): - if isinstance(expr, EvalDerivative) and not expr.base.is_Function: - return expr.args - else: - return flatten(e for e in [cbk_search(a) for a in expr.args] if e) - - cbk_compose = lambda e: split_coeff(e)[1] - basextr = self._do_generate(exprs, exclude, cbk_search, cbk_compose) + # E.g., extract `u.dx*a*b` and `u.dx*a*c` from + # `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]` + basextr = self._do_generate(exprs, exclude, self._cbk_search, + self._cbk_compose) if not basextr: return yield basextr # E.g., extract `u.dx*a` from `[(u.dx*a*b).dy, (u.dx*a*c).dy]` - # That is, attempt extracting the largest common derivative-induced subexprs + # I.e., attempt extracting the largest common derivative-induced subexprs mappers = [deindexify(e) for e in basextr.extracted] counter = Counter(flatten(m.keys() for m in mappers)) groups = as_mapper(counter, key=counter.get) grank = {k: sorted(v, key=lambda e: estimate_cost(e), reverse=True) for k, v in groups.items()} - def cbk_search2(expr, rank): - ret = [] - for e in cbk_search(expr): - mapper = deindexify(e) - for i in rank: - if i in mapper: - ret.extend(mapper[i]) - break - return ret - candidates = sorted(grank, reverse=True)[:2] for i in candidates: lower_pri_elems = flatten([grank[j] for j in candidates if j != i]) - cbk_search_i = lambda e: cbk_search2(e, grank[i] + lower_pri_elems) - yield self._do_generate(exprs, exclude, cbk_search_i, cbk_compose) + cbk_search = lambda e: self._cbk_search2(e, grank[i] + lower_pri_elems) + yield self._do_generate(exprs, exclude, cbk_search, self._cbk_compose) def _lookup_key(self, c): return AliasKey(c.ispace, None, c.dtype, c.guards, c.properties) + def _select(self, variants): + if isinstance(self.opt_schedule_strategy, int): + try: + return variants[self.opt_schedule_strategy] + except IndexError: + raise ValueError("Illegal schedule %d; " + "generated %d schedules in total" + % (self.opt_schedule_strategy, len(variants))) + + return pick_best(variants, self._eval_variants_delta) + def _eval_variants_delta(self, delta_flops, delta_ws): # If there's a greater flop reduction using fewer temporaries, no doubts # what's gonna be the best variant. But if the better flop reduction # comes at the price of using more temporaries, then we have to apply - # heuristics, in particular we estimate how many flops would a temporary + # heuristics, in particular we estimate how many flops a temporary would # allow to save - return ((delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or - (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or - (delta_flops <= 0 and delta_ws >= 0)) + return ( + (delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or + (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or + (delta_flops <= 0 and delta_ws >= 0) + ) + + +class CireEvalDerivatives(CireDerivatives): + + def _cbk_compose(self, expr): + return split_coeff(expr)[1] + + def _cbk_search(self, expr): + if isinstance(expr, EvalDerivative) and not expr.base.is_Function: + return expr.args + else: + return flatten(e for e in [self._cbk_search(a) for a in expr.args] if e) + + def _cbk_search2(self, expr, rank): + ret = [] + for e in self._cbk_search(expr): + mapper = deindexify(e) + for i in rank: + if i in mapper: + ret.extend(mapper[i]) + break + return ret + + +class CireIndexDerivatives(CireDerivatives): + + def _cbk_compose(self, expr): + if expr.is_Pow: + return (expr,) + terms = [] + for a in expr.args: + try: + if not isinstance(a.function, Weights): + terms.append(a) + except AttributeError: + terms.append(a) + return tuple(terms) + + def _cbk_search(self, expr): + if isinstance(expr, IndexDerivative): + return (expr.expr,) + else: + return flatten(e for e in [self._cbk_search(a) for a in expr.args] if e) + + def _cbk_search2(self, expr, rank): + ret = [] + for e in self._cbk_search(expr): + mapper = deindexify(e) + for i in rank: + found = [v.expr if isinstance(v, IndexDerivative) else v + for v in mapper.get(i, [])] + if found: + ret.extend(found) + break + return ret + + +# Subpass mapper +modes = { + 'invariants': [CireInvariantsElementary, + CireInvariantsDivs], + 'eval-derivs': [CireEvalDerivatives], # NOTE: legacy pass + 'index-derivs': [CireIndexDerivatives], +} def collect(extracted, ispace, minstorage): @@ -438,28 +516,9 @@ def collect(extracted, ispace, minstorage): for expr in extracted: assert not expr.is_Equality - indexeds = retrieve_indexed(expr) - - bases = [] - offsets = [] - for i in indexeds: - ii = IterationInstance(i) - if ii.is_irregular: - break - - base = [] - offset = [] - for e, ai in zip(ii, ii.aindices): - if q_constant(e): - base.append(e) - else: - base.append(ai) - offset.append((ai, e - ai)) - bases.append(tuple(base)) - offsets.append(LabeledVector(offset)) - - if not indexeds or len(bases) == len(indexeds): - found.append(Candidate(expr, ispace, indexeds, bases, offsets)) + eg = ExprGeometry(expr) + if eg.is_regular: + found.append(eg) # Create groups of aliasing expressions mapper = OrderedDict() @@ -468,17 +527,13 @@ def collect(extracted, ispace, minstorage): c = unseen.pop(0) group = [c] for u in list(unseen): - # Is the arithmetic structure of `c` and `u` equivalent ? - if not compare_ops(c.expr, u.expr): - continue - # Is `c` translated w.r.t. `u` ? if not c.translated(u): continue group.append(u) unseen.remove(u) - group = Group(group) + group = Group(group, ispace=ispace) if minstorage: k = group.dimensions_translated @@ -561,8 +616,7 @@ def collect(extracted, ispace, minstorage): na = g.naliases nr = nredundants(ispace, pivot) score = estimate_cost(pivot, True)*((na - 1) + nr) - if score > 0: - aliases.add(pivot, aliaseds, list(mapper), distances, score) + aliases.add(pivot, aliaseds, list(mapper), distances, score) return aliases @@ -575,17 +629,17 @@ def choose(aliases, exprs, mapper, mingain): """ aliases = AliasList(aliases) - # `score < m` => discarded + if not aliases: + return exprs, aliases + + # `score <= m` => discarded # `score > M` => optimized - # `m <= score <= M` => maybe discarded, maybe optimized -- depends on heuristics + # `m <= score <= M` => maybe discarded, maybe optimized; depends on heuristics m = mingain M = mingain*3 - if not aliases: - return exprs, aliases - # Filter off the aliases with low score - key = lambda a: a.score >= m + key = lambda a: a.score > m aliases.filter(key) # Project the candidate aliases into `exprs` to derive the final working set @@ -596,7 +650,7 @@ def choose(aliases, exprs, mapper, mingain): # Filter off the aliases with a weak flop-reduction / working-set tradeoff key = lambda a: \ a.score > M or \ - m <= a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset) + m < a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset) aliases.filter(key) if not aliases: @@ -688,8 +742,9 @@ def lower_aliases(aliases, meta, maxpar): # Given the iteration `interval`, lower distances to indices for distance, indices in zip(a.distances, indicess): + v = distance[interval.dim] or 0 try: - indices.append(d - interval.lower + distance[interval.dim]) + indices.append(d - interval.lower + v) except TypeError: indices.append(d) @@ -826,7 +881,7 @@ def lower_schedule(schedule, meta, sregistry, ftemps): make = TempFunction else: # Typical case -- the user does *not* "see" the CIRE-created temporaries - make = Array + make = TempArray clusters = [] subs = {} @@ -927,29 +982,23 @@ def optimize_clusters_msds(clusters): return processed -def pick_best(variants, schedule_strategy, eval_variants_delta): +def pick_best(variants, eval_variants_delta): """ Return the variant with the best trade-off between operation count reduction and working set increase. Heuristics may be applied. """ - if type(schedule_strategy) is int: - try: - return variants[schedule_strategy] - except IndexError: - raise ValueError("Illegal schedule strategy %d; accepted `[0, %d]`" - % (schedule_strategy, len(variants) - 1)) - best = None best_flops_score = None best_ws_score = None for i in variants: - i_flops_score = sum(sa.score for sa in i.schedule) + i_flops_score = i.schedule.score # The working set score depends on the number and dimensionality of # temporaries required by the Schedule i_ws_count = Counter([len(sa.writeto) for sa in i.schedule]) - i_ws_score = tuple(i_ws_count[sa + 1] for sa in reversed(range(max(i_ws_count)))) + i_ws_score = [i_ws_count[sa + 1] + for sa in reversed(range(max(i_ws_count, default=0)))] # TODO: For now, we assume the performance impact of an N-dimensional # temporary is always the same regardless of the value N, but this might # change in the future @@ -970,91 +1019,36 @@ def pick_best(variants, schedule_strategy, eval_variants_delta): # Utilities -class Candidate(object): - - def __init__(self, expr, ispace, indexeds, bases, offsets): - self.expr = expr - self.ispace = ispace - self.indexeds = indexeds - self.bases = bases - self.offsets = offsets - - def __repr__(self): - return "Candidate(expr=%s)" % self.expr - - def translated(self, other): - """ - True if ``self`` is translated w.r.t. ``other``, False otherwise. - - Examples - -------- - Two candidates are translated if their bases are the same and - their offsets are pairwise translated. - - c := A[i,j] op A[i,j+1] -> Toffsets = {i: [0,0], j: [0,1]} - u := A[i+1,j] op A[i+1,j+1] -> Toffsets = {i: [1,1], j: [0,1]} - - Then `c` is translated w.r.t. `u` with distance `{i: 1, j: 0}` - """ - if len(self.Toffsets) != len(other.Toffsets): - return False - if len(self.bases) != len(other.bases): - return False - - # Check the bases - if any(b0 != b1 for b0, b1 in zip(self.bases, other.bases)): - return False - - # Check the offsets - for (d0, o0), (d1, o1) in zip(self.Toffsets, other.Toffsets): - if d0 is not d1: - return False - - distance = set(o0 - o1) - if len(distance) != 1: - return False - - return True - - @cached_property - def Toffsets(self): - return LabeledVector.transpose(*self.offsets) - - @cached_property - def dimensions(self): - return frozenset(i for i, _ in self.Toffsets) - - @property - def shifts(self): - return self.ispace.intervals - - class Group(tuple): """ A collection of aliasing expressions. """ - def __new__(cls, items): + def __new__(cls, items, ispace=None): # Expand out the StencilDimensions, if any processed = [] for c in items: - if not c.expr.find(StencilDimension): + sdims = [d for d in unbounded(c.expr) if d.is_Stencil] + if not sdims: processed.append(c) continue - for f in (sdims_min, sdims_max): + f0 = lambda e: minimum(e, sdims) + f1 = lambda e: maximum(e, sdims) + + for f in (f0, f1): expr = f(c.expr) indexeds = [f(i) for i in c.indexeds] offsets = [LabeledVector([(d, f(i)) for d, i in v.items()]) for v in c.offsets] - processed.append( - Candidate(expr, c.ispace, indexeds, c.bases, offsets) - ) + processed.append(ExprGeometry(expr, indexeds, c.bases, offsets)) obj = super().__new__(cls, processed) obj._items = items + obj._ispace = ispace + return obj def __repr__(self): @@ -1094,6 +1088,8 @@ def diameter(self): ret = defaultdict(int) for i in self.Toffsets: for d, v in i: + if d not in self._ispace: + continue try: distance = int(max(v) - min(v)) except TypeError: @@ -1101,7 +1097,13 @@ def diameter(self): if len(set(v)) == 1: continue else: - raise ValueError + # Worst-case scenario, we raraly end up here + # Resort to the fast vector-based comparison machinery + # (rather than the slower sympy.simplify) + items = [Vector(i) for i in v] + distance, = vmax(*items) - vmin(*items) + if not is_integer(distance): + raise ValueError ret[d] = max(ret[d], distance) return ret @@ -1109,13 +1111,13 @@ def diameter(self): @property def pivot(self): """ - A deterministically chosen Candidate for this Group. + A deterministically chosen reference for this Group. """ return self[0] @property def dimensions(self): - return self.pivot.dimensions + return frozenset(self.diameter) @property def dimensions_translated(self): @@ -1125,10 +1127,11 @@ def dimensions_translated(self): def naliases(self): na = len(self._items) - sdims = set().union(*[c.expr.find(StencilDimension) for c in self._items]) - implicit = int(np.prod([i._size for i in sdims])) - 1 + udims = set().union(*[unbounded(c.expr) for c in self._items]) + sdims = [d for d in udims if d.is_Stencil] + implicit = int(np.prod([i._size for i in sdims])) - return na + implicit + return na*max(implicit, 1) @cached_property def _pivot_legal_rotations(self): @@ -1158,7 +1161,7 @@ def _pivot_legal_rotations(self): def _pivot_min_intervals(self): """ The minimum Interval along each Dimension such that by evaluating the - pivot, all Candidates are evaluated too. + pivot, all other items are evaluated too. """ c = self.pivot @@ -1191,17 +1194,23 @@ def _pivot_legal_shifts(self): ret = defaultdict(lambda: (-np.inf, np.inf)) for i, ofs in zip(c.indexeds, c.offsets): f = i.function + for l in ofs.labels: + if isinstance(l, StencilDimension): + continue + # `f`'s cumulative halo size along `l` hsize = sum(f._size_halo[l]) # Any `ofs`'s shift due to non-[0,0] iteration space - lower, upper = c.shifts[l].offsets + lower, upper = self._ispace.intervals[l].offsets + + ofs0, ofs1 = extrema(ofs[l]) try: # Assume `ofs[l]` is a number (typical case) - maxd = min(0, max(ret[l][0], -ofs[l] - lower)) - mini = max(0, min(ret[l][1], hsize - ofs[l] - upper)) + maxd = min(0, max(ret[l][0], -ofs0 - lower)) + mini = max(0, min(ret[l][1], hsize - ofs1 - upper)) ret[l] = (maxd, mini) except TypeError: @@ -1321,6 +1330,10 @@ def rebuild(self, *items, dmapper=None, rmapper=None, is_frame=False): is_frame=is_frame or self.is_frame ) + @property + def score(self): + return sum(i.score for i in self) + def make_rotations_table(d, v): """ diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 741c8bf558..d731fa2126 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -1,11 +1,12 @@ from sympy import sympify +from devito.finite_differences.differentiable import IndexSum from devito.ir.clusters import Queue from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, SEQUENTIAL, SKEWABLE, TILABLES, Interval, IntervalGroup, IterationSpace, Scope) from devito.passes import is_on_device -from devito.symbolics import uxreplace, xreplace_indices +from devito.symbolics import search, uxreplace, xreplace_indices from devito.tools import UnboundedMultiTuple, as_tuple, flatten, is_integer, prod from devito.types import BlockDimension @@ -100,6 +101,8 @@ def _has_data_reuse(self, cluster): # which translates into the existance of any Relation accross Indexeds if any(r.function.is_AbstractFunction for r in cluster.scope.r_gen()): return True + if search(cluster.exprs, IndexSum): + return True # If it's a reduction operation a la matrix-matrix multiply, two Indexeds # might be enough @@ -314,7 +317,7 @@ def callback(self, clusters, prefix, blk_size_gen=None): # By passing a suitable key to `next` we ensure that we pull the # next par-tile entry iff we're now blocking an unseen TILABLE nest try: - step = sympify(blk_size_gen.next(clusters, d)) + step = sympify(blk_size_gen.next(prefix, d, clusters)) except StopIteration: return clusters else: @@ -418,6 +421,7 @@ class BlockSizeGenerator(object): def __init__(self, par_tile): self.umt = UnboundedMultiTuple(*par_tile) + self.tip = -1 # This is for Clusters that need a small par-tile to avoid under-utilizing # computational resources (e.g., kernels running over iteration spaces that @@ -430,15 +434,45 @@ def __init__(self, par_tile): else: self.umt_small = UnboundedMultiTuple(par_tile.default) - def next(self, clusters, d): + def next(self, prefix, d, clusters): + # If a whole new set of Dimensions, move the tip -- this means `clusters` + # at `d` represents a new loop nest or kernel + x = any(i.dim.is_Block for i in flatten(c.ispace for c in clusters)) + if not x: + self.tip += 1 + # TODO: This is for now exceptionally rudimentary if all(c.properties.is_blockable_small(d) for c in clusters): - umt = self.umt_small + if not x: + self.umt_small.iter() + return self.umt_small.next() + + if x: + item = self.umt.curitem else: + # We can't `self.umt.iter()` because we might still want to + # fallback to `self.umt_small` + item = self.umt.nextitem + + # Handle user-provided rules + # TODO: This is also rudimentary + if item.rule is None: umt = self.umt + elif is_integer(item.rule): + if item.rule == self.tip: + umt = self.umt + else: + umt = self.umt_small + else: + if item.rule in {d.name for d in prefix.itdims}: + umt = self.umt + else: + # This is like "pattern unmatched" -- fallback to `umt_small` + umt = self.umt_small - if not any(i.dim.is_Block for i in flatten(c.ispace for c in clusters)): + if not x: umt.iter() + return umt.next() diff --git a/devito/passes/clusters/cse.py b/devito/passes/clusters/cse.py index 85b4e379df..5f00a0341e 100644 --- a/devito/passes/clusters/cse.py +++ b/devito/passes/clusters/cse.py @@ -8,6 +8,7 @@ from devito.passes.clusters.utils import makeit_ssa from devito.symbolics import estimate_cost, q_leaf from devito.symbolics.manipulation import _uxreplace +from devito.tools import as_list from devito.types import Eq, Temp as Temp0 __all__ = ['cse'] @@ -52,13 +53,18 @@ def _cse(maybe_exprs, make, min_cost=1, mode='default'): # also ensuring some form of post-processing assert mode == 'default' # Only supported mode ATM - # Just for flexibility, accept either Clusters or exprs + # Accept Clusters, Eqs or even just exprs if isinstance(maybe_exprs, Cluster): processed = list(maybe_exprs.exprs) scope = maybe_exprs.scope else: - processed = list(maybe_exprs) - scope = Scope(maybe_exprs) + maybe_exprs = as_list(maybe_exprs) + if all(e.is_Equality for e in maybe_exprs): + processed = maybe_exprs + scope = Scope(maybe_exprs) + else: + processed = [Eq(make(), e) for e in maybe_exprs] + scope = Scope([]) # Some sub-expressions aren't really "common" -- that's the case of Dimension- # independent data dependences. For example: diff --git a/devito/passes/clusters/derivatives.py b/devito/passes/clusters/derivatives.py index 00e7526f88..fd0d8bdcad 100644 --- a/devito/passes/clusters/derivatives.py +++ b/devito/passes/clusters/derivatives.py @@ -1,5 +1,5 @@ from devito.finite_differences import IndexDerivative -from devito.ir import Interval, IterationSpace, Queue +from devito.ir import Backward, Forward, Interval, IterationSpace, Queue from devito.passes.clusters.misc import fuse from devito.symbolics import (retrieve_dimensions, reuse_if_untouched, q_leaf, uxreplace) @@ -87,7 +87,8 @@ def _core(expr, c, weights, mapper, sregistry): dims = tuple(d for d in dims if d not in c.ispace) intervals = [Interval(d) for d in dims] - ispace0 = IterationSpace(intervals) + directions = {d: Backward if d.backward else Forward for d in dims} + ispace0 = IterationSpace(intervals, directions=directions) extra = (c.ispace.itdimensions + dims,) ispace = IterationSpace.union(c.ispace, ispace0, relations=extra) @@ -100,7 +101,9 @@ def _core(expr, c, weights, mapper, sregistry): # Transform e.g. `w[i0] -> w[i0 + 2]` for alignment with the # StencilDimensions starting points - subs = {expr.weights: expr.weights.subs(d, d - d._min) for d in dims} + subs = {expr.weights: + expr.weights.subs(d, d - (d._max if d.backward else d._min)) + for d in dims} expr1 = Inc(s, uxreplace(expr.expr, subs)) processed.append(c.rebuild(exprs=expr1, ispace=ispace)) diff --git a/devito/passes/clusters/factorization.py b/devito/passes/clusters/factorization.py index f023762402..1353bf89b3 100644 --- a/devito/passes/clusters/factorization.py +++ b/devito/passes/clusters/factorization.py @@ -142,13 +142,16 @@ def run(expr): terms.append(i) # Collect common funcs - w_funcs = Add(*w_funcs, evaluate=False) - w_funcs = collect(w_funcs, funcs, evaluate=False) - try: - terms.extend([Mul(k, collect_const(v), evaluate=False) - for k, v in w_funcs.items()]) - except AttributeError: - assert w_funcs == 0 + if len(w_funcs) > 1: + w_funcs = Add(*w_funcs, evaluate=False) + w_funcs = collect(w_funcs, funcs, evaluate=False) + try: + terms.extend([Mul(k, collect_const(v), evaluate=False) + for k, v in w_funcs.items()]) + except AttributeError: + assert w_funcs == 0 + else: + terms.extend(w_funcs) # Collect common pows w_pows = Add(*w_pows, evaluate=False) diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py index 9d0e7463bd..3596bed4e0 100644 --- a/devito/passes/clusters/misc.py +++ b/devito/passes/clusters/misc.py @@ -180,7 +180,7 @@ def _key(self, c): if isinstance(c, Cluster): syncs = (c.syncs,) else: - syncs = c.syncs + syncs = tuple(i.syncs for i in c) for i in syncs: mapper = defaultdict(set) for k, v in i.items(): @@ -301,7 +301,7 @@ def choose_element(queue, scheduled): return ClusterGroup(dag.topological_sort(choose_element), prefix) - def _build_dag(self, cgroups, prefix, peeking=False): + def _build_dag(self, cgroups, prefix): """ A DAG representing the data dependences across the ClusterGroups within a given scope. @@ -348,9 +348,6 @@ def is_cross(dep): elif any(scope.d_output_gen()): dag.add_edge(cg0, cg1) - if peeking and dag.edges: - return dag - return dag diff --git a/devito/passes/equations/linearity.py b/devito/passes/equations/linearity.py index 0950e1126f..66390f821e 100644 --- a/devito/passes/equations/linearity.py +++ b/devito/passes/equations/linearity.py @@ -23,10 +23,16 @@ def collect_derivatives(expressions): mapper = inspect(e) # E.g., 0.2*u.dx -> (0.2*u).dx - ep = aggregate_coeffs(e, mapper) + e1 = aggregate_coeffs(e, mapper) # E.g., (0.2*u).dx + (0.3*v).dx -> (0.2*u + 0.3*v).dx - processed.append(factorize_derivatives(ep)) + e2 = factorize_derivatives(e1) + if e2 == e1: + # No luck, stick to `e` to preserve e.g. the original + # coefficient factorization + processed.append(e) + else: + processed.append(e2) return processed diff --git a/devito/passes/iet/instrument.py b/devito/passes/iet/instrument.py index db9b446d7f..b844fd9a08 100644 --- a/devito/passes/iet/instrument.py +++ b/devito/passes/iet/instrument.py @@ -1,10 +1,12 @@ -from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Section, - TimedList, Transformer) +from itertools import groupby + +from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Iteration, + Section, TimedList, Transformer) from devito.mpi.routines import (HaloUpdateCall, HaloWaitCall, MPICall, MPIList, HaloUpdateList, HaloWaitList, RemainderCall, ComputeCall) from devito.passes.iet.engine import iet_pass -from devito.types import Timer +from devito.types import TempArray, TempFunction, Timer __all__ = ['instrument'] @@ -25,10 +27,12 @@ def instrument(graph, **kwargs): @iet_pass def track_subsections(iet, **kwargs): """ - Add custom Sections to the `profiler`. Custom Sections include: + Add sub-Sections to the `profiler`. Sub-Sections include: * MPI Calls (e.g., HaloUpdateCall and HaloUpdateWait) * Busy-waiting on While(lock) (e.g., from host-device orchestration) + * Multi-pass implementations -- one sub-Section for each pass, within one + macro Section """ profiler = kwargs['profiler'] sregistry = kwargs['sregistry'] @@ -45,6 +49,7 @@ def track_subsections(iet, **kwargs): mapper = {} + # MPI Calls, busy-waiting for NodeType in [MPIList, MPICall, BusyWait, ComputeCall]: for k, v in MapNodes(Section, NodeType).visit(iet).items(): for i in v: @@ -57,6 +62,37 @@ def track_subsections(iet, **kwargs): iet = Transformer(mapper).visit(iet) + # Multi-pass implementations + mapper = {} + + for i in FindNodes(Iteration).visit(iet): + for k, g in groupby(i.nodes, key=lambda n: n.is_Section): + if not k: + continue + + candidates = [] + for i in g: + functions = FindSymbols().visit(i) + if any(isinstance(f, (TempArray, TempFunction)) for f in functions): + candidates.append(i) + else: + # They must be consecutive Sections + break + if len(candidates) < 2: + continue + + name = sregistry.make_name(prefix='multipass') + body = [i._rebuild(is_subsection=True) for i in candidates] + section = Section(name, body=body) + + profiler.group_as_subsections(name, [i.name for i in candidates]) + + mapper[candidates.pop(0)] = section + for i in candidates: + mapper[i] = None + + iet = Transformer(mapper).visit(iet) + return iet, {} @@ -100,6 +136,6 @@ def sync_sections(iet, lang=None, profiler=None, **kwargs): if runs_async and not unnecessary: mapper[tl] = tl._rebuild(body=tl.body + (sync,)) - iet = Transformer(mapper).visit(iet) + iet = Transformer(mapper, nested=True).visit(iet) return iet, {} diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 4199283e54..e83aeae943 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -5,11 +5,11 @@ from devito.finite_differences import Max, Min from devito.ir import (Any, Forward, Iteration, List, Prodder, FindApplications, - FindNodes, Transformer, Uxreplace, filter_iterations, - retrieve_iteration_tree) + FindNodes, FindSymbols, Transformer, Uxreplace, + filter_iterations, retrieve_iteration_tree) from devito.passes.iet.engine import iet_pass from devito.symbolics import evalrel, has_integer_args -from devito.tools import as_mapper, split +from devito.tools import as_mapper, filter_ordered, split __all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions', 'generate_macros', 'minimize_symbols'] @@ -178,23 +178,29 @@ def minimize_symbols(iet): def remove_redundant_moddims(iet): - subs0 = {} - subs1 = {} - for n in FindNodes(Iteration).visit(iet): - mds = [d for d in n.uindices - if d.is_Modulo and d.origin is not None] - if not mds: - continue + key = lambda d: d.is_Modulo and d.origin is not None + mds = [d for d in FindSymbols('dimensions').visit(iet) if key(d)] + if not mds: + return iet - mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo) - for k, v in mapper.items(): - chosen = v.pop(0) - subs0.update({d: chosen for d in v}) + mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo) - uindices = [d for d in n.uindices if d not in subs0] - subs1[n] = n._rebuild(uindices=uindices) + subs = {} + for k, v in mapper.items(): + chosen = v.pop(0) + subs.update({d: chosen for d in v}) + + body = Uxreplace(subs).visit(iet.body) + iet = iet._rebuild(body=body) + + # ModuloDimensions are defined in Iteration headers, hence they must be + # removed from there too + subs = {} + for n in FindNodes(Iteration).visit(iet): + if not set(n.uindices) & set(mds): + continue + subs[n] = n._rebuild(uindices=filter_ordered(n.uindices)) - iet = Transformer(subs1, nested=True).visit(iet) - iet = Uxreplace(subs0).visit(iet) + iet = Transformer(subs, nested=True).visit(iet) return iet diff --git a/devito/symbolics/printer.py b/devito/symbolics/printer.py index c47ef95bfc..af362af96a 100644 --- a/devito/symbolics/printer.py +++ b/devito/symbolics/printer.py @@ -225,10 +225,13 @@ def _print_TrigonometricFunction(self, expr): func_name += 'f' return '%s(%s)' % (func_name, self._print(*expr.args)) + def _print_DefFunction(self, expr): + arguments = [self._print(i) for i in expr.arguments] + return "%s(%s)" % (expr.name, ','.join(arguments)) + def _print_Fallback(self, expr): return expr.__str__() - _print_DefFunction = _print_Fallback _print_MacroArgument = _print_Fallback _print_IndexedBase = _print_Fallback _print_IndexSum = _print_Fallback diff --git a/devito/symbolics/queries.py b/devito/symbolics/queries.py index ec86ae7809..c4002508cb 100644 --- a/devito/symbolics/queries.py +++ b/devito/symbolics/queries.py @@ -20,7 +20,8 @@ # * Number # * Symbol # * Indexed -extra_leaves = (FieldFromPointer, FieldFromComposite, IndexedBase, AbstractObject) +extra_leaves = (FieldFromPointer, FieldFromComposite, IndexedBase, AbstractObject, + IndexedPointer) def q_symbol(expr): @@ -31,7 +32,9 @@ def q_symbol(expr): def q_leaf(expr): - return expr.is_Atom or expr.is_Indexed or isinstance(expr, extra_leaves) + return (expr.is_Atom or + expr.is_Indexed or + isinstance(expr, extra_leaves)) def q_indexed(expr): @@ -51,7 +54,7 @@ def q_derivative(expr): def q_terminal(expr): return (expr.is_Symbol or expr.is_Indexed or - isinstance(expr, extra_leaves + (IndexedPointer,))) + isinstance(expr, extra_leaves)) def q_routine(expr): diff --git a/devito/symbolics/search.py b/devito/symbolics/search.py index 94e533a692..7680bd462a 100644 --- a/devito/symbolics/search.py +++ b/devito/symbolics/search.py @@ -1,3 +1,5 @@ +import sympy + from devito.symbolics.queries import (q_indexed, q_function, q_terminal, q_leaf, q_symbol, q_dimension, q_derivative) from devito.tools import as_tuple @@ -48,7 +50,7 @@ def __init__(self, query, mode, deep=False): self.deep = deep def _next(self, expr): - if self.deep is True and expr.is_Indexed: + if self.deep and expr.is_Indexed: return expr.indices elif q_leaf(expr): return () @@ -110,10 +112,18 @@ def search(exprs, query, mode='unique', visit='dfs', deep=False): assert mode in Search.modes, "Unknown mode" - searcher = Search(query, mode, deep) + if isinstance(query, type): + Q = lambda obj: isinstance(obj, query) + else: + Q = query + + searcher = Search(Q, mode, deep) found = Search.modes[mode]() for e in as_tuple(exprs): + if not isinstance(e, sympy.Basic): + continue + if visit == 'dfs': found.update(searcher.dfs(e)) elif visit == 'bfs': diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index 10a1d90672..218e1c6c98 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -29,6 +29,13 @@ class Bunch(object): def __init__(self, **kwargs): self.__dict__.update(kwargs) + def __repr__(self): + return "Bunch(%s)" % ", ".join(["%s=%s" % i for i in self.__dict__.items()]) + + def __iter__(self): + for i in self.__dict__.values(): + yield i + class EnrichedTuple(tuple, Pickable): @@ -319,11 +326,13 @@ class DAG(object): https://github.com/thieman/py-dag/ """ - def __init__(self, nodes=None, edges=None): + def __init__(self, nodes=None, edges=None, labels=None): self.graph = OrderedDict() self.labels = DefaultOrderedDict(dict) + for node in as_tuple(nodes): self.add_node(node) + for i in as_tuple(edges): try: ind_node, dep_node = i @@ -332,6 +341,11 @@ def __init__(self, nodes=None, edges=None): self.labels[ind_node][dep_node] = label self.add_edge(ind_node, dep_node) + for ind_node, i in (labels or {}).items(): + for dep_node, v in i.items(): + if dep_node in self.graph.get(ind_node, []): + self.labels[ind_node][dep_node] = v + def __contains__(self, key): return key in self.graph @@ -339,6 +353,10 @@ def __contains__(self, key): def nodes(self): return tuple(self.graph) + @property + def roots(self): + return tuple(n for n in self.nodes if not self.predecessors(n)) + @property def edges(self): ret = [] @@ -350,6 +368,9 @@ def edges(self): def size(self): return len(self.graph) + def clone(self): + return self.__class__(self.nodes, self.edges, self.labels) + def add_node(self, node_name, ignore_existing=False): """Add a node if it does not exist yet, or error out.""" if node_name in self.graph: @@ -398,6 +419,24 @@ def predecessors(self, node): """Return a list of all predecessors of the given node.""" return [key for key in self.graph if node in self.graph[key]] + def all_predecessors(self, node): + """ + Return a list of all nodes ultimately predecessors of the given node + in the dependency graph, in topological order. + """ + found = set() + + def _all_predecessors(n): + if n in found: + return + found.add(n) + for predecessor in self.predecessors(n): + _all_predecessors(predecessor) + + _all_predecessors(node) + + return found + def downstream(self, node): """Return a list of all nodes this node has edges towards.""" if node not in self.graph: @@ -493,6 +532,28 @@ def connected_components(self, enumerated=False): else: return tuple(groups) + def find_paths(self, node): + if node not in self.graph: + raise KeyError('node %s is not in graph' % node) + + paths = [] + + def dfs(node, path): + path.append(node) + + if not self.graph[node]: + paths.append(tuple(path)) + else: + for child in self.graph[node]: + dfs(child, path) + + # Remove the node from the path to backtrack + path.pop() + + dfs(node, []) + + return tuple(paths) + class frozendict(Mapping): """ @@ -575,7 +636,11 @@ def __init__(self, *items): nitems = [] for i in as_tuple(items): if isinstance(i, Iterable): - nitems.append(tuple(i)) + if isinstance(i, tuple): + # Honours tuple subclasses + nitems.append(i) + else: + nitems.append(tuple(i)) else: raise ValueError("Expected sequence, got %s" % type(i)) @@ -589,6 +654,17 @@ def __repr__(self): items[self.tip] = "*%s" % items[self.tip] return "%s(%s)" % (self.__class__.__name__, ", ".join(items)) + @property + def curitem(self): + return self.items[self.tip] + + @property + def nextitem(self): + return self.items[min(self.tip + 1, max(len(self.items) - 1, 0))] + + def index(self, item): + return self.items.index(item) + def iter(self): if not self.items: raise ValueError("No tuples available") diff --git a/devito/tools/dtypes_lowering.py b/devito/tools/dtypes_lowering.py index 9793904ac2..7a4074b370 100644 --- a/devito/tools/dtypes_lowering.py +++ b/devito/tools/dtypes_lowering.py @@ -75,12 +75,15 @@ def add_dtype(self, field_name, count): self.update(build_dtypes_vector([field_name], [count])) - def get_base_dtype(self, v): + def get_base_dtype(self, v, default=None): for (base_dtype, count), dtype in self.items(): if dtype is v: return base_dtype - raise ValueError + if default is not None: + return default + else: + raise ValueError dtypes_vector_mapper = DTypesVectorMapper() @@ -254,8 +257,10 @@ def infer_dtype(dtypes): highest precision; * If there's at least one floating dtype, ignore any integer dtypes. """ - fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)} + # Resolve the vector types, if any + dtypes = {dtypes_vector_mapper.get_base_dtype(i, i) for i in dtypes} + fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)} if len(fdtypes) > 1: return max(fdtypes, key=lambda i: np.dtype(i).itemsize) elif len(fdtypes) == 1: diff --git a/devito/types/array.py b/devito/types/array.py index f12922f330..f7113c19b1 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -11,7 +11,8 @@ from devito.types.basic import AbstractFunction from devito.types.utils import CtypesFactory, DimensionTuple -__all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle'] +__all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle', + 'ComponentAccess'] class ArrayBasic(AbstractFunction): diff --git a/devito/types/basic.py b/devito/types/basic.py index d7a422b39e..cc73bda6cf 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -1462,3 +1462,17 @@ def compare(self, other): if c: return c return 0 + + def _subs(self, old, new, **hints): + # Wrap in a try to make sure no substitution happens when + # old is an Indexed as only checkink `old is new` would lead to + # incorrect substitution of `old.base` by `new` + try: + if old.is_Indexed: + if old.base == self.base and old.indices == self.indices: + return new + else: + return self + except AttributeError: + pass + return super()._subs(old, new, **hints) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 6626f09517..620c23e56a 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1347,8 +1347,10 @@ class StencilDimension(BasicDimension): is_Stencil = True __rargs__ = BasicDimension.__rargs__ + ('_min', '_max') + __rkwargs__ = BasicDimension.__rkwargs__ + ('backward',) - def __init_finalize__(self, name, _min, _max, spacing=None, **kwargs): + def __init_finalize__(self, name, _min, _max, spacing=None, backward=False, + **kwargs): self._spacing = sympy.sympify(spacing) or sympy.S.One if not is_integer(_min): @@ -1360,13 +1362,21 @@ def __init_finalize__(self, name, _min, _max, spacing=None, **kwargs): self._min = _min self._max = _max + self._size = _max - _min + 1 + self._backward = backward + if self._size < 1: raise ValueError("Expected size greater than 0 (got %s)" % self._size) def _hashable_content(self): - return super()._hashable_content() + (self._min, self._max, self._spacing) + return (super()._hashable_content() + + (self._min, self._max, self._spacing, self._backward)) + + @cached_property + def backward(self): + return self._backward @cached_property def symbolic_size(self): @@ -1513,7 +1523,7 @@ class AffineIndexAccessFunction(IndexAccessFunction): * the "main" Dimension (in practice a SpaceDimension or a TimeDimension); * an offset (number or symbolic expression, of dtype integer). - * a StencilDimension; + * one or more StencilDimensions; Examples -------- @@ -1522,39 +1532,47 @@ class AffineIndexAccessFunction(IndexAccessFunction): """ def __new__(cls, *args, **kwargs): + # `args` may contain arbitrarily complicated expressions, so first of all + # we let SymPy simplify it, then we process the args and see if the + # resulting expression is indeed an AffineIndexAccessFunction + add = sympy.Add(*args, **kwargs) + if not isinstance(add, sympy.Add): + # E.g., reduced to a Symbol + return add + d = 0 - sd = 0 + sds = [] ofs_items = [] - for a in args: + for a in add.args: if isinstance(a, StencilDimension): - if sd != 0: - return sympy.Add(*args, **kwargs) - sd = a + sds.append(a) elif isinstance(a, Dimension): d = cls._separate_dims(d, a, ofs_items) if d is None: - return sympy.Add(*args, **kwargs) + return add elif isinstance(a, AffineIndexAccessFunction): - if sd != 0 and a.sd != 0: - return sympy.Add(*args, **kwargs) + if sds and a.sds: + return add d = cls._separate_dims(d, a.d, ofs_items) if d is None: - return sympy.Add(*args, **kwargs) - sd = a.sd + return add + sds = list(a.sds or sds) ofs_items.append(a.ofs) else: ofs_items.append(a) ofs = sympy.Add(*[i for i in ofs_items if i is not None]) if not all(is_integer(i) or i.is_Symbol for i in ofs.free_symbols): - return sympy.Add(*args, **kwargs) + return add + + sds = tuple(sds) - obj = IndexAccessFunction.__new__(cls, d, ofs, sd) + obj = IndexAccessFunction.__new__(cls, d, ofs, *sds) if isinstance(obj, AffineIndexAccessFunction): obj.d = d obj.ofs = ofs - obj.sd = sd + obj.sds = sds else: # E.g., SymPy simplified it to Zero or something else pass @@ -1564,7 +1582,7 @@ def __new__(cls, *args, **kwargs): @classmethod def _separate_dims(cls, d0, d1, ofs_items): if d0 == 0 and d1 == 0: - return None + return 0 elif d0 == 0 and isinstance(d1, Dimension): return d1 elif d1 == 0 and isinstance(d0, Dimension): diff --git a/devito/types/misc.py b/devito/types/misc.py index c644a60614..ccbd9b7fdc 100644 --- a/devito/types/misc.py +++ b/devito/types/misc.py @@ -3,12 +3,12 @@ import numpy as np from sympy.core.core import ordering_of_classes -from devito.types import CompositeObject, Indexed, Symbol +from devito.types import Array, CompositeObject, Indexed, Symbol from devito.types.basic import IndexedData from devito.tools import Pickable, as_tuple __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard', - 'Global', 'Hyperplane', 'Indirection', 'Temp', 'Jump'] + 'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump'] class Timer(CompositeObject): @@ -175,8 +175,8 @@ def __new__(cls, name=None, mapped=None, dtype=np.uint64, is_const=True, class Temp(Symbol): """ - A Temp is a Symbol used by compiler passes to store locally-constructed - temporary expressions. + A Temp is a Symbol used by compiler passes to store intermediate + sub-expressions. """ # Just make sure the SymPy args ordering is the same regardless of whether @@ -184,6 +184,16 @@ class Temp(Symbol): ordering_of_classes.insert(ordering_of_classes.index('Symbol') + 1, 'Temp') +class TempArray(Array): + + """ + A TempArray is an Array used by compiler passes to store intermediate + sub-expressions. + """ + + pass + + class Jump(object): """ diff --git a/devito/types/object.py b/devito/types/object.py index d5d20d65d9..1db973cce9 100644 --- a/devito/types/object.py +++ b/devito/types/object.py @@ -53,6 +53,8 @@ def __repr__(self): def _sympystr(self, printer): return str(self) + _ccode = _sympystr + def _hashable_content(self): return (self.name, self.dtype) diff --git a/examples/performance/01_gpu.ipynb b/examples/performance/01_gpu.ipynb index a3f01a55a4..1b4e29ddb9 100644 --- a/examples/performance/01_gpu.ipynb +++ b/examples/performance/01_gpu.ipynb @@ -253,7 +253,7 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define uL0(time, x, y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", + "#define uL0(time,x,y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", diff --git a/tests/test_derivatives.py b/tests/test_derivatives.py index 123975d63d..8031932df5 100644 --- a/tests/test_derivatives.py +++ b/tests/test_derivatives.py @@ -457,6 +457,16 @@ def test_all_shortcuts(self, so): for fd in g._fd: assert getattr(g, fd) + def test_transpose_simple(self): + grid = Grid(shape=(4, 4)) + + f = TimeFunction(name='f', grid=grid, space_order=4) + + assert str(f.dx.T.evaluate) == ("-0.0833333333*f(t, x - 2*h_x, y)/h_x " + "+ 0.666666667*f(t, x - h_x, y)/h_x " + "- 0.666666667*f(t, x + h_x, y)/h_x " + "+ 0.0833333333*f(t, x + 2*h_x, y)/h_x") + @pytest.mark.parametrize('so', [2, 4, 8, 12]) @pytest.mark.parametrize('ndim', [1, 2]) @pytest.mark.parametrize('derivative, adjoint_name', [ @@ -743,6 +753,19 @@ def test_dxdy_v2(self): term1 = f.dxdy._evaluate(expand=False) assert len(term1.find(IndexDerivative)) == 2 + def test_transpose(self): + grid = Grid(shape=(4, 4)) + x, _ = grid.dimensions + h_x = x.spacing + + f = TimeFunction(name='f', grid=grid, space_order=4) + + term = f.dx.T._evaluate(expand=False) + assert isinstance(term, IndexDerivative) + + i0, = term.dimensions + assert term.base == f.subs(x, x + i0*h_x) + def bypass_uneval(expr): unevals = expr.find(EvalDerivative) diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 7bf28faa35..701995e443 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -28,7 +28,7 @@ def test_basic(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == 1 - assert expr.sd == 0 + assert expr.sds == () s0 = Symbol(name='s0', dtype=np.int32) s1 = Symbol(name='s1', dtype=np.int32) @@ -38,7 +38,7 @@ def test_basic(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == s0 + s1 + 1 - assert expr.sd == 0 + assert expr.sds == () def test_reversed(self): d = Dimension(name='x') @@ -48,14 +48,14 @@ def test_reversed(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs == 1 - assert expr.sd == 0 + assert expr.sds == () expr = d.symbolic_max + d assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d assert expr.ofs is d.symbolic_max - assert expr.sd == 0 + assert expr.sds == () def test_non_affine(self): grid = Grid(shape=(3,)) @@ -76,8 +76,8 @@ def test_stencil_dim(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == 1 + assert expr.sds == (sd,) s = Symbol(name='s') @@ -85,8 +85,36 @@ def test_stencil_dim(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == 1 + s + assert expr.sds == (sd,) + + expr = sd + 1 + d + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + assert expr.sds == (sd,) + + def test_stencil_dim_multiple(self): + d = Dimension(name='x') + sd0 = StencilDimension('i0', 0, 1) + sd1 = StencilDimension('i1', 0, 1) + + expr = d + sd0 + sd1 + 1 + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + assert expr.sds == (sd0, sd1) + + s = Symbol(name='s') + + expr = sd0 + d + sd1 + 1 + s + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d is d + assert expr.ofs == 1 + s + assert expr.sds == (sd0, sd1) def test_sub(self): d = Dimension(name='x') @@ -104,8 +132,15 @@ def test_sub(self): assert isinstance(expr, AffineIndexAccessFunction) assert expr.d is d - assert expr.sd is sd assert expr.ofs == -1 - s + assert expr.sds == (sd,) + + expr = d + 1 + sd - d + + assert isinstance(expr, AffineIndexAccessFunction) + assert expr.d == 0 + assert expr.ofs == 1 + assert expr.sds == (sd,) class TestBufferedDimension(object): diff --git a/tests/test_dle.py b/tests/test_dle.py index cbe19aee7e..7a6bc3496c 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -294,9 +294,9 @@ class TestBlockingParTile(object): ((16, 16, 16), ((16, 16, 16), (16, 16, 16))), ((32, 4, 4), ((4, 4, 32), (4, 4, 32))), (((16, 4, 4), (16, 16, 16)), ((4, 4, 16), (16, 16, 16))), - (((32, 4, 4), 1), ((4, 4, 32), (4, 4, 32))), - (((32, 4, 4), 1, 'tag0'), ((4, 4, 32), (4, 4, 32))), - ((((32, 4, 8), 1, 'tag0'), ((32, 8, 4), 2)), ((8, 4, 32), (4, 8, 32))), + (((32, 4, 4), None), ((4, 4, 32), (4, 4, 32))), + (((32, 4, 4), None, 'tag0'), ((4, 4, 32), (4, 4, 32))), + ((((32, 4, 8), None, 'tag0'), ((32, 8, 4), None)), ((8, 4, 32), (4, 8, 32))), ]) def test_structure(self, par_tile, expected): grid = Grid(shape=(8, 8, 8)) @@ -344,6 +344,62 @@ def test_structure_2p5D(self): assert iters[0].step == par_tile[1] assert iters[1].step == par_tile[0] + def test_custom_rule0(self): + grid = Grid(shape=(8, 8, 8)) + + u = TimeFunction(name="u", grid=grid, space_order=4) + v = TimeFunction(name="v", grid=grid, space_order=4) + + eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx), + Eq(v.forward, u.forward.dx)] + + # "Apply par-tile=(4, 4, 4) to the loop nest (kernel) with id (rule)=1, + # and use default for the rest!" + par_tile = (4, 4, 4) + rule = 1 + + op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule), + 'blockinner': True})) + + # Check generated code. By having specified "1" as rule, we expect the + # given par-tile to be applied to the kernel with id 1 + bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'z2_blk0'}) + root = bns['x1_blk0'] + iters = FindNodes(Iteration).visit(root) + iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] + assert len(iters) == 3 + assert all(i.step == j for i, j in zip(iters, par_tile)) + + def test_custom_rule1(self): + grid = Grid(shape=(8, 8, 8)) + x, y, z = grid.dimensions + + f = Function(name='f', grid=grid) + u = TimeFunction(name="u", grid=grid, space_order=4) + v = TimeFunction(name="v", grid=grid, space_order=4) + + eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx + cos(f)*cos(f[x+1, y, z])), + Eq(v.forward, u.forward.dx)] + + # "Apply par-tile=(4, 4, 4) to the loop nests (kernels) embedded within + # the time loop, and use default for the rest!" + par_tile = (4, 4, 4) + rule = grid.time_dim.name # We must be able to infer it from str + + op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule), + 'blockinner': True, + 'blockrelax': True})) + + # Check generated code. By having specified "1" as rule, we expect the + # given par-tile to be applied to the kernel with id 1 + bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'x2_blk0', 'x3_blk0'}) + for i in ['x1_blk0', 'x2_blk0', 'x3_blk0']: + root = bns[i] + iters = FindNodes(Iteration).visit(root) + iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] + assert len(iters) == 3 + assert all(i.step == j for i, j in zip(iters, par_tile)) + @pytest.mark.parametrize("shape", [(10,), (10, 45), (20, 33), (10, 31, 45), (45, 31, 45)]) @pytest.mark.parametrize("time_order", [2]) @@ -1028,6 +1084,7 @@ def test_multiple_subnests_v0(self): _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3.*f) + 1.) op = Operator(eqn, opt=('advanced', {'openmp': True, 'cire-mingain': 0, + 'cire-schedule': 1, 'par-nested': 0, 'par-collapse-ncores': 1, 'par-dynamic-work': 0})) @@ -1063,6 +1120,7 @@ def test_multiple_subnests_v1(self): _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3.*f) + 1.) op = Operator(eqn, opt=('advanced', {'openmp': True, 'cire-mingain': 0, + 'cire-schedule': 1, 'cire-rotate': True, 'par-nested': 0, 'par-collapse-ncores': 1, diff --git a/tests/test_dse.py b/tests/test_dse.py index b346e00092..9473068cbc 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -2,21 +2,27 @@ import pytest from cached_property import cached_property +from sympy import Mul # noqa + from conftest import (skipif, EVAL, _R, assert_structure, assert_blocking, # noqa get_params, get_arrays, check_array) -from devito import (NODE, Eq, Inc, Constant, Function, TimeFunction, SparseTimeFunction, # noqa - Dimension, SubDimension, ConditionalDimension, DefaultDimension, Grid, - Operator, norm, grad, div, dimensions, switchconfig, configuration, - centered, first_derivative, solve, transpose, Abs, cos, sin, sqrt) +from devito import (NODE, Eq, Inc, Constant, Function, TimeFunction, # noqa + SparseTimeFunction, Dimension, SubDimension, + ConditionalDimension, DefaultDimension, Grid, Operator, + norm, grad, div, dimensions, switchconfig, configuration, + centered, first_derivative, solve, transpose, Abs, cos, + sin, sqrt) from devito.exceptions import InvalidArgument, InvalidOperator from devito.finite_differences.differentiable import diffify from devito.ir import (Conditional, DummyEq, Expression, Iteration, FindNodes, FindSymbols, ParallelIteration, retrieve_iteration_tree) from devito.passes.clusters.aliases import collect +from devito.passes.clusters.factorization import collect_nested from devito.passes.clusters.cse import Temp, _cse from devito.passes.iet.parpragma import VExpanded from devito.symbolics import (INT, FLOAT, DefFunction, FieldFromPointer, # noqa - Keyword, SizeOf, estimate_cost, pow_to_mul, indexify) + IndexedPointer, Keyword, SizeOf, estimate_cost, + pow_to_mul, indexify) from devito.tools import as_tuple, generator from devito.types import Array, Scalar, Symbol @@ -161,6 +167,9 @@ def test_cse(exprs, expected, min_cost): ('fa[x]**(-s)', 'fa[x]**(-s)'), ('-2/(s**2)', '-2/(s*s)'), ('-fa[x]', '-fa[x]'), + ('Mul(SizeOf("char"), ' + '-IndexedPointer(FieldFromPointer("size", fa._C_symbol), x), evaluate=False)', + 'sizeof(char)*(-fa_vec->size[x])'), ]) def test_pow_to_mul(expr, expected): grid = Grid((4, 5)) @@ -173,6 +182,19 @@ def test_pow_to_mul(expr, expected): assert str(pow_to_mul(eval(expr))) == expected +@pytest.mark.parametrize('expr,expected', [ + ('s - SizeOf("int")*fa[x]', 's - fa[x]*sizeof(int)'), +]) +def test_factorize(expr, expected): + grid = Grid((4, 5)) + x, y = grid.dimensions + + s = Scalar(name='s') # noqa + fa = Function(name='fa', grid=grid, dimensions=(x,), shape=(4,)) # noqa + + assert str(collect_nested(eval(expr))) == expected + + @pytest.mark.parametrize('expr,expected,estimate', [ ('Eq(t0, 3)', 0, False), ('Eq(t0, 4.5)', 0, False), @@ -505,6 +527,7 @@ def test_collection(self, exprs, expected): ispace = exprs[0].ispace aliases = collect(extracted, ispace, False) + aliases.filter(lambda a: a.score > 0) assert len(aliases) == len(expected) assert all(i.pivot in expected for i in aliases) @@ -2211,7 +2234,8 @@ def test_derivatives_from_different_levels(self): eqn = Eq(v.forward, f*(1. + v).dx + 2.*f*((1. + v).dx + f)) - op = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 0})) # Check code generation assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 1 diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index a93d280fc7..30e4c45639 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -845,8 +845,8 @@ def test_tasking_over_compiler_generated(self): assert len(retrieve_iteration_tree(op)) == 5 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) - assert len(sections) == 3 - assert 'while(lock0[t1] == 0)' in str(sections[1].body[0].body[0].body[0]) + assert len(sections) == 4 + assert 'while(lock0[t1] == 0)' in str(sections[2].body[0].body[0].body[0]) op0.apply(time_M=nt-1) op1.apply(time_M=nt-1, u=u1, usave=usave1) diff --git a/tests/test_linearize.py b/tests/test_linearize.py index 45347bf605..efed9f79db 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -479,8 +479,8 @@ def test_issue_1838(): op1.apply(time_M=3, dt=1., p0=p1) # Check generated code - assert "r0L0(x, y, z) r0[(x)*y_stride1 + (y)*z_stride1 + (z)]" in str(op1) - assert "r4L0(x, y, z) r4[(x)*y_stride2 + (y)*z_stride1 + (z)]" in str(op1) + assert "r0L0(x,y,z) r0[(x)*y_stride1 + (y)*z_stride1 + (z)]" in str(op1) + assert "r4L0(x,y,z) r4[(x)*y_stride2 + (y)*z_stride1 + (z)]" in str(op1) assert np.allclose(p0.data, p1.data, rtol=1e-6) diff --git a/tests/test_mpi.py b/tests/test_mpi.py index 8639a435e4..bf49ba8014 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -2079,7 +2079,8 @@ def test_cire(self): eqn = Eq(u.forward, _R(_R(u[t, x, y] + u[t, x+1, y+1])*3.*f + _R(u[t, x+2, y+2] + u[t, x+3, y+3])*3.*f) + 1.) op0 = Operator(eqn, opt='noop') - op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 1})) assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 1 @@ -2112,7 +2113,8 @@ def test_cire_with_shifted_diagonal_halo_touch(self): eqn = Eq(u.forward, _R(_R(u[t, x, y] + u[t, x+2, y])*3.*f + _R(u[t, x+1, y+1] + u[t, x+3, y+1])*3.*f) + 1.) op0 = Operator(eqn, opt='noop') - op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0})) + op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0, + 'cire-schedule': 1})) assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 1 diff --git a/tests/test_symbolics.py b/tests/test_symbolics.py index 27d40c93c2..b957a058c4 100644 --- a/tests/test_symbolics.py +++ b/tests/test_symbolics.py @@ -292,6 +292,18 @@ class BarCast(Cast): assert v != v1 +def test_findexed(): + grid = Grid(shape=(3, 3, 3)) + f = Function(name='f', grid=grid) + + fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) + new_fi = fi.func(strides=(3, 4)) + + assert new_fi.name == fi.name == 'f' + assert new_fi.indices == fi.indices + assert new_fi.strides == (3, 4) + + def test_symbolic_printing(): b = Symbol('b') @@ -304,17 +316,11 @@ class MyLocalObject(LocalObject, Expr): lo = MyLocalObject(name='lo') assert str(lo + 2) == '2 + lo' - -def test_findexed(): - grid = Grid(shape=(3, 3, 3)) - f = Function(name='f', grid=grid) - + grid = Grid((10,)) + f = Function(name="f", grid=grid) fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2)) - new_fi = fi.func(strides=(3, 4)) - - assert new_fi.name == fi.name == 'f' - assert new_fi.indices == fi.indices - assert new_fi.strides == (3, 4) + df = DefFunction('aaa', arguments=[fi]) + assert ccode(df) == 'aaa(foo(x))' def test_is_on_grid(): diff --git a/tests/test_unexpansion.py b/tests/test_unexpansion.py index d9d01b6fc9..7d0391a85a 100644 --- a/tests/test_unexpansion.py +++ b/tests/test_unexpansion.py @@ -5,7 +5,23 @@ from devito.types import Symbol -class TestBasic(object): +class TestLoopScheduling(object): + + def test_backward_dt2(self): + grid = Grid(shape=(4, 4)) + + f = Function(name='f', grid=grid) + v = TimeFunction(name='v', grid=grid, time_order=2) + + eqns = [Eq(v.backward, v + 1.), + Eq(f, v.dt2)] + + op = Operator(eqns, opt=('advanced', {'openmp': True, + 'expand': False})) + assert_structure(op, ['t,x,y'], 't,x,y') + + +class Test1Pass(object): def test_v0(self): grid = Grid(shape=(10, 10, 10)) @@ -85,7 +101,8 @@ def test_v2(self): op0 = Operator(eqns) op1 = Operator(eqns, opt=('advanced', {'expand': False, - 'blocklevels': 0})) + 'blocklevels': 0, + 'cire-mingain': 200})) # Check generated code -- expect maximal fusion! assert_structure(op1, @@ -110,7 +127,8 @@ def test_v3(self): Eq(v.forward, (v + u.dx.dy + 1.))] op0 = Operator(eqns) - op1 = Operator(eqns, opt=('advanced', {'expand': False})) + op1 = Operator(eqns, opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check generated code -- redundant IndexDerivatives have been caught! op1._profiler._sections['section0'].sops == 65 @@ -123,63 +141,12 @@ def test_v3(self): def test_v4(self): grid = Grid(shape=(16, 16, 16)) - t = grid.stepping_dim - x, y, z = grid.dimensions - - so = 4 - - a = Function(name='a', grid=grid, space_order=so) - f = Function(name='f', grid=grid, space_order=so) - e = Function(name='e', grid=grid, space_order=so) - r = Function(name='r', grid=grid, space_order=so) - p0 = TimeFunction(name='p0', grid=grid, time_order=2, space_order=so) - m0 = TimeFunction(name='m0', grid=grid, time_order=2, space_order=so) - - def g1(field, r, e): - return (cos(e) * cos(r) * field.dx(x0=x+x.spacing/2) + - cos(e) * sin(r) * field.dy(x0=y+y.spacing/2) - - sin(e) * field.dz(x0=z+z.spacing/2)) - - def g2(field, r, e): - return - (sin(r) * field.dx(x0=x+x.spacing/2) - - cos(r) * field.dy(x0=y+y.spacing/2)) - - def g3(field, r, e): - return (sin(e) * cos(r) * field.dx(x0=x+x.spacing/2) + - sin(e) * sin(r) * field.dy(x0=y+y.spacing/2) + - cos(e) * field.dz(x0=z+z.spacing/2)) - - def g1_tilde(field, r, e): - return ((cos(e) * cos(r) * field).dx(x0=x-x.spacing/2) + - (cos(e) * sin(r) * field).dy(x0=y-y.spacing/2) - - (sin(e) * field).dz(x0=z-z.spacing/2)) - - def g2_tilde(field, r, e): - return - ((sin(r) * field).dx(x0=x-x.spacing/2) - - (cos(r) * field).dy(x0=y-y.spacing/2)) - - def g3_tilde(field, r, e): - return ((sin(e) * cos(r) * field).dx(x0=x-x.spacing/2) + - (sin(e) * sin(r) * field).dy(x0=y-y.spacing/2) + - (cos(e) * field).dz(x0=z-z.spacing/2)) - - update_p = t.spacing**2 * a**2 / f * \ - (g1_tilde(f * g1(p0, r, e), r, e) + - g2_tilde(f * g2(p0, r, e), r, e) + - g3_tilde(f * g3(p0, r, e) + f * g3(m0, r, e), r, e)) + \ - (2 - t.spacing * a) - - update_m = t.spacing**2 * a**2 / f * \ - (g1_tilde(f * g1(m0, r, e), r, e) + - g2_tilde(f * g2(m0, r, e), r, e) + - g3_tilde(f * g3(m0, r, e) + f * g3(p0, r, e), r, e)) + \ - (2 - t.spacing * a) - - eqns = [Eq(p0.forward, update_p), - Eq(m0.forward, update_m)] + + eqns = tti_sa_eqns(grid) op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 400})) # Check code generation assert op._profiler._sections['section1'].sops == 1442 @@ -203,7 +170,8 @@ def test_v5(self): Eq(m0.forward, m0.dx.dx + m0.backward)] op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check code generation assert op._profiler._sections['section0'].sops == 127 @@ -227,10 +195,145 @@ def test_v6(self): Eq(m0.forward, (m0.dx + s0).dx + f*m0.backward)] op = Operator(eqns, subs=grid.spacing_map, - opt=('advanced', {'expand': False})) + opt=('advanced', {'expand': False, + 'cire-mingain': 200})) # Check code generation assert op._profiler._sections['section0'].sops == 133 assert_structure(op, ['t,x,y', 't,x,y,i1', 't,x,y,i1,i0'], 't,x,y,i1,i0') op.cfunction + + def test_transpose(self): + shape = (10, 10, 10) + grid = Grid(shape=shape) + x, _, _ = grid.dimensions + + u = TimeFunction(name='u', grid=grid, space_order=4) + u1 = TimeFunction(name='u', grid=grid, space_order=4) + + # Chessboard-like init + u.data[:] = np.indices(shape).sum(axis=0) % 10 + 1 + u1.data[:] = np.indices(shape).sum(axis=0) % 10 + 1 + + eqn = Eq(u.forward, u.dx(x0=x+x.spacing/2).T + 1.) + + op0 = Operator(eqn) + op1 = Operator(eqn, opt=('advanced', {'expand': False})) + + op0.apply(time_M=10) + op1.apply(time_M=10, u=u1) + + assert np.allclose(u.data, u1.data, rtol=10e-6) + + +class Test2Pass(object): + + def test_v0(self): + grid = Grid(shape=(10, 10, 10)) + + u = TimeFunction(name='u', grid=grid, space_order=4) + v = TimeFunction(name='v', grid=grid, space_order=4) + u1 = TimeFunction(name='u', grid=grid, space_order=4) + v1 = TimeFunction(name='v', grid=grid, space_order=4) + + eqns = [Eq(u.forward, (u.dx.dy + v*u + 1.)), + Eq(v.forward, (v + u.dx.dy + 1.))] + + op0 = Operator(eqns) + op1 = Operator(eqns, opt=('advanced', {'expand': False, + 'openmp': True})) + + # Check generated code + op1._profiler._sections['section0'].sops == 65 + assert_structure(op1, ['t', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i0', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i1'], + 't,x0_blk0,y0_blk0,x,y,z,i0,y,z,i1') + + op0.apply(time_M=5) + op1.apply(time_M=5, u=u1, v=v1) + + assert np.allclose(u.data, u1.data, rtol=10e-3) + assert np.allclose(v.data, v1.data, rtol=10e-3) + + def test_v1(self): + grid = Grid(shape=(16, 16, 16)) + + eqns = tti_sa_eqns(grid) + + op = Operator(eqns, subs=grid.spacing_map, + opt=('advanced', {'expand': False, + 'openmp': False})) + + # Check code generation + assert op._profiler._sections['section1'].sops == 190 + assert_structure(op, ['x,y,z', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i0', + 't,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y,z,i1'], + 'x,y,z,t,x0_blk0,y0_blk0,x,y,z,i0,x,y,z,i1') + + op.cfunction + + +def tti_sa_eqns(grid): + t = grid.stepping_dim + x, y, z = grid.dimensions + + so = 4 + + a = Function(name='a', grid=grid, space_order=so) + f = Function(name='f', grid=grid, space_order=so) + e = Function(name='e', grid=grid, space_order=so) + r = Function(name='r', grid=grid, space_order=so) + p0 = TimeFunction(name='p0', grid=grid, time_order=2, space_order=so) + m0 = TimeFunction(name='m0', grid=grid, time_order=2, space_order=so) + + def g1(field, r, e): + return (cos(e) * cos(r) * field.dx(x0=x+x.spacing/2) + + cos(e) * sin(r) * field.dy(x0=y+y.spacing/2) - + sin(e) * field.dz(x0=z+z.spacing/2)) + + def g2(field, r, e): + return - (sin(r) * field.dx(x0=x+x.spacing/2) - + cos(r) * field.dy(x0=y+y.spacing/2)) + + def g3(field, r, e): + return (sin(e) * cos(r) * field.dx(x0=x+x.spacing/2) + + sin(e) * sin(r) * field.dy(x0=y+y.spacing/2) + + cos(e) * field.dz(x0=z+z.spacing/2)) + + def g1_tilde(field, r, e): + return ((cos(e) * cos(r) * field).dx(x0=x-x.spacing/2) + + (cos(e) * sin(r) * field).dy(x0=y-y.spacing/2) - + (sin(e) * field).dz(x0=z-z.spacing/2)) + + def g2_tilde(field, r, e): + return - ((sin(r) * field).dx(x0=x-x.spacing/2) - + (cos(r) * field).dy(x0=y-y.spacing/2)) + + def g3_tilde(field, r, e): + return ((sin(e) * cos(r) * field).dx(x0=x-x.spacing/2) + + (sin(e) * sin(r) * field).dy(x0=y-y.spacing/2) + + (cos(e) * field).dz(x0=z-z.spacing/2)) + + update_p = t.spacing**2 * a**2 / f * \ + (g1_tilde(f * g1(p0, r, e), r, e) + + g2_tilde(f * g2(p0, r, e), r, e) + + g3_tilde(f * g3(p0, r, e) + f * g3(m0, r, e), r, e)) + \ + (2 - t.spacing * a) + + update_m = t.spacing**2 * a**2 / f * \ + (g1_tilde(f * g1(m0, r, e), r, e) + + g2_tilde(f * g2(m0, r, e), r, e) + + g3_tilde(f * g3(m0, r, e) + f * g3(p0, r, e), r, e)) + \ + (2 - t.spacing * a) + + eqns = [Eq(p0.forward, update_p), + Eq(m0.forward, update_m)] + + return eqns