diff --git a/devito/core/operator.py b/devito/core/operator.py
index 2de585a11e..887c79978a 100644
--- a/devito/core/operator.py
+++ b/devito/core/operator.py
@@ -329,9 +329,9 @@ class OptOption(object):
 
 class ParTileArg(tuple):
 
-    def __new__(cls, items, shm=0, tag=None):
+    def __new__(cls, items, rule=None, tag=None):
         obj = super().__new__(cls, items)
-        obj.shm = shm
+        obj.rule = rule
         obj.tag = tag
         return obj
 
@@ -371,14 +371,15 @@ def __new__(cls, items, default=None):
 
                 try:
                     y = items[1]
-                    if is_integer(y):
-                        # E.g., ((32, 4, 8), 1)
-                        # E.g., ((32, 4, 8), 1, 'tag')
+                    if is_integer(y) or isinstance(y, str) or y is None:
+                        # E.g., ((32, 4, 8), 'rule')
+                        # E.g., ((32, 4, 8), 'rule', 'tag')
                         items = (ParTileArg(*items),)
                     else:
                         try:
-                            # E.g., (((32, 4, 8), 1), ((32, 4, 4), 2))
-                            # E.g., (((32, 4, 8), 1, 'tag0'), ((32, 4, 4), 2, 'tag1'))
+                            # E.g., (((32, 4, 8), 'rule'), ((32, 4, 4), 'rule'))
+                            # E.g., (((32, 4, 8), 'rule0', 'tag0'),
+                            #        ((32, 4, 4), 'rule1', 'tag1'))
                             items = tuple(ParTileArg(*i) for i in items)
                         except TypeError:
                             # E.g., ((32, 4, 8), (32, 4, 4))
diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py
index 1ca7e29506..ed4bab957a 100644
--- a/devito/finite_differences/differentiable.py
+++ b/devito/finite_differences/differentiable.py
@@ -346,6 +346,10 @@ def __new__(cls, *args, **kwargs):
         return obj
 
     def subs(self, *args, **kwargs):
+        if len(args) == 2:
+            old, new = args
+            if self == old:
+                return new
         return self.func(*[getattr(a, 'subs', lambda x: a)(*args, **kwargs)
                            for a in self.args], evaluate=False)
 
@@ -556,6 +560,9 @@ def __repr__(self):
 
     __str__ = __repr__
 
+    def _sympystr(self, printer):
+        return str(self)
+
     def _hashable_content(self):
         return super()._hashable_content() + (self.dimensions,)
 
@@ -621,7 +628,7 @@ def __eq__(self, other):
     __hash__ = sympy.Basic.__hash__
 
     def _hashable_content(self):
-        return (self.name, self.dimension, hash(tuple(self.weights)))
+        return (self.name, self.dimension, str(self.weights))
 
     @property
     def dimension(self):
@@ -665,6 +672,10 @@ def __new__(cls, expr, mapper, **kwargs):
     def _hashable_content(self):
         return super()._hashable_content() + (self.mapper,)
 
+    @cached_property
+    def base(self):
+        return self.expr.func(*[a for a in self.expr.args if a is not self.weights])
+
     @property
     def weights(self):
         return self._weights
diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py
index e6e35f03fd..9eb4261b2b 100644
--- a/devito/finite_differences/finite_difference.py
+++ b/devito/finite_differences/finite_difference.py
@@ -207,9 +207,11 @@ def generic_derivative(expr, dim, fd_order, deriv_order, matvec=direct, x0=None,
                            matvec, x0, symbolic, expand)
 
 
-def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic, expand):
+def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic,
+                    expand):
     # The stencil indices
-    indices, x0 = generate_indices(expr, dim, fd_order, side=side, matvec=matvec, x0=x0)
+    indices, x0 = generate_indices(expr, dim, fd_order, side=side, matvec=matvec,
+                                   x0=x0)
 
     # Finite difference weights from Taylor approximation given these positions
     if symbolic:
@@ -221,15 +223,24 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, symbolic
     weights = [sympify(w).evalf(_PRECISION) for w in weights]
 
     # Transpose the FD, if necessary
-    if matvec:
-        indices = indices.scale(matvec.val)
+    indices = indices.scale(matvec.val)
 
     # Shift index due to staggering, if any
     indices = indices.shift(-(expr.indices_ref[dim] - dim))
 
+    # The user may wish to restrict expansion to selected derivatives
+    if callable(expand):
+        expand = expand(dim)
+
     if not expand and indices.expr is not None:
         weights = Weights(name='w', dimensions=indices.free_dim, initvalue=weights)
 
+        if matvec == transpose:
+            # For homogenity, always generate e.g. `x + i0` rather than `x - i0`
+            # for transpose and `x + i0` for direct
+            indices = indices.transpose()
+            weights = weights._subs(indices.free_dim, -indices.free_dim)
+
         # Inject the StencilDimension
         # E.g. `x + i*h_x` into `f(x)` s.t. `f(x + i*h_x)`
         expr = expr._subs(dim, indices.expr)
diff --git a/devito/finite_differences/tools.py b/devito/finite_differences/tools.py
index ba112a3116..5ec3629403 100644
--- a/devito/finite_differences/tools.py
+++ b/devito/finite_differences/tools.py
@@ -197,6 +197,24 @@ def scale(self, v):
 
         return IndexSet(self.dim, indices, expr=expr, fd=self.free_dim)
 
+    def transpose(self):
+        """
+        Transpose the IndexSet.
+        """
+        indices = tuple(reversed(self))
+
+        free_dim = StencilDimension(self.free_dim.name,
+                                    -self.free_dim._max,
+                                    -self.free_dim._min,
+                                    backward=True)
+
+        try:
+            expr = self.expr._subs(self.free_dim, -free_dim)
+        except AttributeError:
+            expr = None
+
+        return IndexSet(self.dim, indices, expr=expr, fd=free_dim)
+
     def shift(self, v):
         """
         Construct a new IndexSet with all indices shifted by `v`.
diff --git a/devito/ir/clusters/algorithms.py b/devito/ir/clusters/algorithms.py
index 2c0bdacc0c..d9150573c8 100644
--- a/devito/ir/clusters/algorithms.py
+++ b/devito/ir/clusters/algorithms.py
@@ -6,8 +6,8 @@
 import sympy
 
 from devito.exceptions import InvalidOperator
-from devito.ir.support import (Any, Backward, Forward, IterationSpace,
-                               PARALLEL_IF_ATOMIC, pull_dims)
+from devito.ir.support import (Any, Backward, Forward, IterationSpace, erange,
+                               pull_dims)
 from devito.ir.clusters.analysis import analyze
 from devito.ir.clusters.cluster import Cluster, ClusterGroup
 from devito.ir.clusters.visitors import Queue, QueueStateful, cluster_pass
@@ -121,10 +121,12 @@ def callback(self, clusters, prefix, backlog=None, known_break=None):
             require_break = scope.d_flow.cause & maybe_break
             if require_break:
                 backlog = [clusters[-1]] + backlog
-                # Try with increasingly smaller ClusterGroups until the ambiguity is gone
+                # Try with increasingly smaller ClusterGroups until the
+                # ambiguity is gone
                 return self.callback(clusters[:-1], prefix, backlog, require_break)
 
-        # Schedule Clusters over different IterationSpaces if this increases parallelism
+        # Schedule Clusters over different IterationSpaces if this increases
+        # parallelism
         for i in range(1, len(clusters)):
             if self._break_for_parallelism(scope, candidates, i):
                 return self.callback(clusters[:i], prefix, clusters[i:] + backlog,
@@ -146,8 +148,8 @@ def callback(self, clusters, prefix, backlog=None, known_break=None):
         if not backlog:
             return processed
 
-        # Handle the backlog -- the Clusters characterized by flow- and anti-dependences
-        # along one or more Dimensions
+        # Handle the backlog -- the Clusters characterized by flow- and
+        # anti-dependences along one or more Dimensions
         idir = {d: Any for d in known_break}
         stamp = Stamp()
         for i, c in enumerate(list(backlog)):
@@ -278,7 +280,11 @@ def callback(self, clusters, prefix):
                 size = i.function.shape_allocated[d]
                 assert is_integer(size)
 
-                mapper[size][si].add(iaf)
+                # Resolve StencilDimensions in case of unexpanded expressions
+                # E.g. `i0 + t` -> `(t - 1, t, t + 1)`
+                iafs = erange(iaf)
+
+                mapper[size][si].update(iafs)
 
         # Construct the ModuloDimensions
         mds = []
@@ -288,7 +294,8 @@ def callback(self, clusters, prefix):
                 # SymPy's index ordering (t, t-1, t+1) afer modulo replacement so
                 # that associativity errors are consistent. This corresponds to
                 # sorting offsets {-1, 0, 1} as {0, -1, 1} assigning -inf to 0
-                siafs = sorted(iafs, key=lambda i: -np.inf if i - si == 0 else (i - si))
+                key = lambda i: -np.inf if i - si == 0 else (i - si)
+                siafs = sorted(iafs, key=key)
 
                 for iaf in siafs:
                     name = '%s%d' % (si.name, len(mds))
@@ -451,7 +458,8 @@ def normalize_reductions(cluster, sregistry, options):
     """
     opt_mapify_reduce = options['mapify-reduce']
 
-    dims = [d for d, v in cluster.properties.items() if PARALLEL_IF_ATOMIC in v]
+    dims = [d for d in cluster.ispace.itdims
+            if cluster.properties.is_parallel_atomic(d)]
 
     if not dims:
         return cluster
diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py
index 0dc3200b4f..636c224e59 100644
--- a/devito/ir/clusters/cluster.py
+++ b/devito/ir/clusters/cluster.py
@@ -8,7 +8,7 @@
                                Forward, Interval, IntervalGroup, IterationSpace,
                                DataSpace, Guards, Properties, Scope, detect_accesses,
                                detect_io, normalize_properties, normalize_syncs,
-                               sdims_min, sdims_max)
+                               minimum, maximum)
 from devito.mpi.halo_scheme import HaloScheme, HaloTouch
 from devito.symbolics import estimate_cost
 from devito.tools import as_tuple, flatten, frozendict, infer_dtype
@@ -52,13 +52,7 @@ def __init__(self, exprs, ispace=None, guards=None, properties=None, syncs=None,
 
         # Normalize properties
         properties = Properties(properties or {})
-        for d in ispace.itdimensions:
-            properties = properties.add(d)
-        for i in properties:
-            for d in as_tuple(i):
-                if d not in ispace.itdimensions:
-                    properties = properties.drop(d)
-        self._properties = properties
+        self._properties = tailor_properties(properties, ispace)
 
         self._halo_scheme = halo_scheme
 
@@ -85,10 +79,7 @@ def from_clusters(cls, *clusters):
 
         guards = root.guards
 
-        properties = {}
-        for c in clusters:
-            for d, v in c.properties.items():
-                properties[d] = normalize_properties(properties.get(d, v), v)
+        properties = reduce_properties(clusters)
 
         try:
             syncs = normalize_syncs(*[c.syncs for c in clusters])
@@ -213,12 +204,10 @@ def is_dense(self):
         # at most PARALLEL_IF_PVT). This is a quick and easy check so we try it first
         try:
             pset = {PARALLEL, PARALLEL_IF_PVT}
-            grid = self.grid
-            for d in grid.dimensions:
-                if not any(pset & v for k, v in self.properties.items()
-                           if d in k._defines):
-                    raise ValueError
-            return True
+            target = set(self.grid.dimensions)
+            dims = {d for d in self.properties if d._defines & target}
+            if any(pset & self.properties[d] for d in dims):
+                return True
         except ValueError:
             pass
 
@@ -276,8 +265,8 @@ def dspace(self):
                 continue
 
             intervals = [Interval(d,
-                                  min([sdims_min(i) for i in offs]),
-                                  max([sdims_max(i) for i in offs]))
+                                  min([minimum(i) for i in offs]),
+                                  max([maximum(i) for i in offs]))
                          for d, offs in v.items()]
             intervals = IntervalGroup(intervals)
 
@@ -418,6 +407,10 @@ def scope(self):
     def ispace(self):
         return self._ispace
 
+    @cached_property
+    def properties(self):
+        return tailor_properties(reduce_properties(self), self.ispace)
+
     @cached_property
     def guards(self):
         """The guards of each Cluster in self."""
@@ -425,8 +418,10 @@ def guards(self):
 
     @cached_property
     def syncs(self):
-        """The synchronization operations of each Cluster in self."""
-        return tuple(i.syncs for i in self)
+        """
+        A view of the ClusterGroup's synchronization operations.
+        """
+        return normalize_syncs(*[c.syncs for c in self])
 
     @cached_property
     def dspace(self):
@@ -461,3 +456,26 @@ def meta(self):
             The data type and the data space of the ClusterGroup.
         """
         return (self.dtype, self.dspace)
+
+
+# *** Utils
+
+def reduce_properties(clusters):
+    properties = {}
+    for c in clusters:
+        for d, v in c.properties.items():
+            properties[d] = normalize_properties(properties.get(d, v), v)
+
+    return Properties(properties)
+
+
+def tailor_properties(properties, ispace):
+    for d in ispace.itdimensions:
+        properties = properties.add(d)
+
+    for i in properties:
+        for d in as_tuple(i):
+            if d not in ispace.itdimensions:
+                properties = properties.drop(d)
+
+    return properties
diff --git a/devito/ir/support/__init__.py b/devito/ir/support/__init__.py
index 2b20e3ab20..8a8b50bc67 100644
--- a/devito/ir/support/__init__.py
+++ b/devito/ir/support/__init__.py
@@ -1,5 +1,5 @@
-from .utils import *  # noqa
 from .vector import *  # noqa
+from .utils import *  # noqa
 from .basic import *  # noqa
 from .space import *  # noqa
 from .guards import *  # noqa
diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py
index 36ea735109..b73b49befe 100644
--- a/devito/ir/support/basic.py
+++ b/devito/ir/support/basic.py
@@ -4,15 +4,16 @@
 from sympy import S
 
 from devito.ir.support.space import Backward, IterationSpace
-from devito.ir.support.utils import AccessMode
+from devito.ir.support.utils import AccessMode, extrema
 from devito.ir.support.vector import LabeledVector, Vector
-from devito.symbolics import (retrieve_terminals, q_constant, q_affine, q_routine,
-                              q_terminal)
+from devito.symbolics import (compare_ops, retrieve_indexed, retrieve_terminals,
+                              q_constant, q_affine, q_routine, search, uxreplace)
 from devito.tools import (Tag, as_tuple, is_integer, filter_sorted, flatten,
                           memoized_meth, memoized_generator)
-from devito.types import Barrier, Dimension, DimensionTuple, Jump, Symbol
+from devito.types import (Barrier, ComponentAccess, Dimension, DimensionTuple,
+                          Jump, Symbol)
 
-__all__ = ['IterationInstance', 'TimedAccess', 'Scope']
+__all__ = ['IterationInstance', 'TimedAccess', 'Scope', 'ExprGeometry']
 
 
 class IndexMode(Tag):
@@ -236,7 +237,7 @@ def __eq__(self, other):
         # which might require expensive comparisons of Vector entries (i.e.,
         # SymPy expressions)
 
-        return (self.access is other.access and  # => self.function is other.function
+        return (self.access is other.access and
                 self.mode == other.mode and
                 self.ispace == other.ispace)
 
@@ -322,6 +323,12 @@ def distance(self, other):
         other : TimedAccess
             The TimedAccess w.r.t. which the distance is computed.
         """
+        if isinstance(self.access, ComponentAccess) and \
+           isinstance(other.access, ComponentAccess) and \
+           self.access.index != other.access.index:
+            # E.g., `uv(x).x` and `uv(x).y` -- not a real dependence!
+            return Vector(S.ImaginaryUnit)
+
         ret = []
         for sit, oit in zip(self.itintervals, other.itintervals):
             n = len(ret)
@@ -810,9 +817,9 @@ def __init__(self, exprs, rules=None):
 
         for i, e in enumerate(exprs):
             # Reads
-            terminals = retrieve_terminals(e.rhs, deep=True, mode='unique')
+            terminals = retrieve_accesses(e.rhs, deep=True)
             try:
-                terminals.update(retrieve_terminals(e.lhs.indices))
+                terminals.update(retrieve_accesses(e.lhs.indices))
             except AttributeError:
                 pass
             for j in terminals:
@@ -821,12 +828,10 @@ def __init__(self, exprs, rules=None):
                 v.append(TimedAccess(j, mode, i, e.ispace))
 
             # Write
-            terminals = []
-            if q_terminal(e.lhs):
-                terminals.append(e.lhs)
+            terminals = retrieve_accesses(e.lhs)
             if q_routine(e.rhs):
                 try:
-                    terminals.extend(e.rhs.writes)
+                    terminals.update(e.rhs.writes)
                 except AttributeError:
                     # E.g., foreign routines, such as `cos` or `sin`
                     pass
@@ -846,7 +851,7 @@ def __init__(self, exprs, rules=None):
 
             # Look up ConditionalDimensions
             for v in e.conditionals.values():
-                for j in retrieve_terminals(v):
+                for j in retrieve_accesses(v):
                     v = self.reads.setdefault(j.function, [])
                     v.append(TimedAccess(j, 'R', -1, e.ispace))
 
@@ -860,7 +865,7 @@ def __init__(self, exprs, rules=None):
         # Objects altering the control flow (e.g., synchronization barriers,
         # break statements, ...) are converted into mock dependences
         for i, e in enumerate(exprs):
-            if e.find(Barrier) | e.find(Jump):
+            if isinstance(e.rhs, (Barrier, Jump)):
                 self.writes.setdefault(mocksym, []).append(
                     TimedAccess(mocksym, 'W', i, e.ispace)
                 )
@@ -883,7 +888,8 @@ def __getitem__(self, function):
         return self.getwrites(function) + self.getreads(function)
 
     def __repr__(self):
-        tracked = filter_sorted(set(self.reads) | set(self.writes), key=lambda i: i.name)
+        tracked = filter_sorted(set(self.reads) | set(self.writes),
+                                key=lambda i: i.name)
         maxlen = max(1, max([len(i.name) for i in tracked]))
         out = "{:>%d} =>  W : {}\n{:>%d}     R : {}" % (maxlen, maxlen)
         pad = " "*(maxlen + 9)
@@ -908,6 +914,19 @@ def __repr__(self):
         return "\n".join([out.format(i.name, w, '', r)
                           for i, r, w in zip(tracked, reads, writes)])
 
+    @cached_property
+    def reads_extremaed(self):
+        """
+        A view of the Scope's reads in which StencilDimensions are replaced
+        with their extrema.
+        """
+        ret = {f: [] for f in self.reads}
+        for f, v in self.reads.items():
+            for i in v:
+                for j in set(extrema(i.access)):
+                    ret[f].append(TimedAccess(j, i.mode, i.timestamp, i.ispace))
+        return ret
+
     @cached_property
     def accesses(self):
         groups = list(self.reads.values()) + list(self.writes.values())
@@ -933,9 +952,12 @@ def d_flow_gen(self):
         """Generate the flow (or "read-after-write") dependences."""
         for k, v in self.writes.items():
             for w in v:
-                for r in self.reads.get(k, []):
+                for r in self.reads_extremaed.get(k, []):
                     dependence = Dependence(w, r)
 
+                    if dependence.is_imaginary:
+                        continue
+
                     if any(not rule(dependence) for rule in self.rules):
                         continue
 
@@ -946,8 +968,7 @@ def d_flow_gen(self):
                         # Non-integer vectors are not comparable.
                         # Conservatively, we assume it is a dependence, unless
                         # it's a read-for-reduction
-                        is_flow = (not dependence.is_imaginary and
-                                   not r.is_read_reduction)
+                        is_flow = not r.is_read_reduction
                     if is_flow:
                         yield dependence
 
@@ -961,9 +982,12 @@ def d_anti_gen(self):
         """Generate the anti (or "write-after-read") dependences."""
         for k, v in self.writes.items():
             for w in v:
-                for r in self.reads.get(k, []):
+                for r in self.reads_extremaed.get(k, []):
                     dependence = Dependence(r, w)
 
+                    if dependence.is_imaginary:
+                        continue
+
                     if any(not rule(dependence) for rule in self.rules):
                         continue
 
@@ -974,8 +998,7 @@ def d_anti_gen(self):
                         # Non-integer vectors are not comparable.
                         # Conservatively, we assume it is a dependence, unless
                         # it's a read-for-reduction
-                        is_anti = (not dependence.is_imaginary and
-                                   not r.is_read_reduction)
+                        is_anti = not r.is_read_reduction
                     if is_anti:
                         yield dependence
 
@@ -992,6 +1015,9 @@ def d_output_gen(self):
                 for w2 in self.writes.get(k, []):
                     dependence = Dependence(w2, w1)
 
+                    if dependence.is_imaginary:
+                        continue
+
                     if any(not rule(dependence) for rule in self.rules):
                         continue
 
@@ -1001,7 +1027,7 @@ def d_output_gen(self):
                     except TypeError:
                         # Non-integer vectors are not comparable.
                         # Conservatively, we assume it is a dependence
-                        is_output = not dependence.is_imaginary
+                        is_output = True
                     if is_output:
                         yield dependence
 
@@ -1028,7 +1054,7 @@ def d_from_access_gen(self, accesses):
         accesses = as_tuple(accesses)
         for d in self.d_all_gen():
             for i in accesses:
-                if d.source is i or d.sink is i:
+                if d.source == i or d.sink == i:
                     yield d
                     break
 
@@ -1064,3 +1090,147 @@ def r_all(self):
         All Relations of the Scope.
         """
         return list(self.r_gen())
+
+
+class ExprGeometry(object):
+
+    """
+    Geometric representation of an expression by abstracting Indexeds as
+    LabeledVectors.
+    """
+
+    def __init__(self, expr, indexeds=None, bases=None, offsets=None):
+        self.expr = expr
+
+        if indexeds is not None:
+            self.indexeds = indexeds
+            self.bases = bases
+            self.offsets = offsets
+            return
+
+        self.indexeds = retrieve_indexed(expr)
+
+        bases = []
+        offsets = []
+        for ii in self.iinstances:
+            base = []
+            offset = []
+            for e, fi, ai in zip(ii, ii.findices, ii.aindices):
+                if ai is None:
+                    base.append((fi, e))
+                else:
+                    base.append((fi, ai))
+                    offset.append((ai, e - ai))
+            bases.append(LabeledVector(base))
+            offsets.append(LabeledVector(offset))
+
+        self.bases = bases
+        self.offsets = offsets
+
+    def __repr__(self):
+        return "ExprGeometry(expr=%s)" % self.expr
+
+    def translated(self, other, dims=None):
+        """
+        True if `self` is translated w.r.t. `other`, False otherwise.
+
+        Examples
+        --------
+        Two expressions are translated if they perform the same operations,
+        their bases are the same and their offsets are pairwise translated.
+
+        c := A[i,j] op A[i,j+1]     -> Toffsets = {i: [0,0], j: [0,1]}
+        u := A[i+1,j] op A[i+1,j+1] -> Toffsets = {i: [1,1], j: [0,1]}
+
+        Then `c` is translated w.r.t. `u` with distance `{i: 1, j: 0}`
+
+        The test may be strengthen by imposing that a translation occurs
+        only along a specific set of Dimensions through the kwarg `dims`.
+        """
+        # Check mathematical structure
+        if not compare_ops(self.expr, other.expr):
+            return False
+
+        # Use a suitable value for `dims` if not provided by user
+        if dims is None:
+            if self.aindices != other.aindices:
+                return False
+            dims = self.aindices
+        dims = set(as_tuple(dims))
+
+        # Check bases and offsets
+        for i in ['Tbases', 'Toffsets']:
+            Ti0 = getattr(self, i)
+            Ti1 = getattr(other, i)
+
+            m0 = dict(Ti0)
+            m1 = dict(Ti1)
+
+            # The only hope in presence of Dimensions appearing only in either
+            # `self` or `other` is that they have been projected away by the caller
+            for d in set(m0).symmetric_difference(set(m1)):
+                if not d._defines & dims:
+                    return False
+
+            for d in set(m0).union(set(m1)):
+                try:
+                    o0 = m0[d]
+                    o1 = m1[d]
+                except KeyError:
+                    continue
+
+                distance = set(o0 - o1)
+                if len(distance) != 1:
+                    return False
+
+                if not d._defines & dims:
+                    if distance.pop() != 0:
+                        return False
+
+        return True
+
+    @cached_property
+    def iinstances(self):
+        return tuple(IterationInstance(i) for i in self.indexeds)
+
+    @cached_property
+    def Tbases(self):
+        return LabeledVector.transpose(*self.bases)
+
+    @cached_property
+    def Toffsets(self):
+        return LabeledVector.transpose(*self.offsets)
+
+    @cached_property
+    def dimensions(self):
+        return frozenset(i for i, _ in self.Toffsets)
+
+    @cached_property
+    def aindices(self):
+        try:
+            return tuple(zip(*self.Toffsets))[0]
+        except IndexError:
+            return ()
+
+    @property
+    def is_regular(self):
+        return all(i.is_regular for i in self.iinstances)
+
+
+# *** Utils
+
+def retrieve_accesses(exprs, **kwargs):
+    """
+    Like retrieve_terminals, but ensure that if a ComponentAccess is found,
+    the ComponentAccess itself is returned, while the wrapped Indexed is discarded.
+    """
+    kwargs['mode'] = 'unique'
+
+    compaccs = search(exprs, ComponentAccess)
+    if not compaccs:
+        return retrieve_terminals(exprs, **kwargs)
+
+    subs = {i: Symbol('dummy%d' % n) for n, i in enumerate(compaccs)}
+    exprs1 = uxreplace(exprs, subs)
+
+    return compaccs | retrieve_terminals(exprs1, **kwargs) - set(subs.values())
diff --git a/devito/ir/support/guards.py b/devito/ir/support/guards.py
index dd1bcc01a5..a378929fbc 100644
--- a/devito/ir/support/guards.py
+++ b/devito/ir/support/guards.py
@@ -8,7 +8,7 @@
 
 from devito.ir.support.space import Forward, IterationDirection
 from devito.symbolics import CondEq, CondNe
-from devito.tools import Pickable, as_tuple, frozendict
+from devito.tools import Pickable, as_tuple, frozendict, split
 from devito.types import Dimension
 
 __all__ = ['GuardFactor', 'GuardBound', 'GuardBoundNext', 'BaseGuardBound',
@@ -228,7 +228,7 @@ def andg(self, d, guard):
             return Guards(m)
 
         try:
-            m[d] = And(m[d], guard)
+            m[d] = and_smart(m[d], guard)
         except KeyError:
             m[d] = guard
 
@@ -269,3 +269,40 @@ def filter(self, key):
         m = {d: v for d, v in self.items() if key(d)}
 
         return Guards(m)
+
+
+# *** Utils
+
+def and_smart(relation, v):
+    """
+    Given `x = And(*relation.args, v)`, return `relation` if `x ≡ relation`,
+    `x` otherwise.
+
+    SymPy doesn't have a builtin function to simplify boolean inequalities; here,
+    we run a set of simple checks to at least discard the most obvious (and thus
+    annoying to see in the generated code) redundancies.
+    """
+    if isinstance(relation, And):
+        candidates, other = split(list(relation.args), lambda a: type(a) is type(v))
+    elif type(relation) is type(v):
+        candidates, other = [relation], []
+    else:
+        candidates, other = [], [relation, v]
+
+    covered = False
+    new_args = []
+    for a in candidates:
+        if a.lhs is v.lhs:
+            covered = True
+            if type(a) in (Gt, Ge) and v.rhs > a.rhs:
+                new_args.append(v)
+            elif type(a) in (Lt, Le) and v.rhs < a.rhs:
+                new_args.append(v)
+            else:
+                new_args.append(a)
+        else:
+            new_args.append(a)
+    if not covered:
+        new_args.append(v)
+
+    return And(*(new_args + other))
diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py
index f4ab873575..dfc7a7d4c2 100644
--- a/devito/ir/support/properties.py
+++ b/devito/ir/support/properties.py
@@ -208,6 +208,9 @@ def is_parallel(self, dims):
     def is_parallel_relaxed(self, dims):
         return any(len(self[d] & PARALLELS) > 0 for d in as_tuple(dims))
 
+    def is_parallel_atomic(self, dims):
+        return any(PARALLEL_IF_ATOMIC in self.get(d, ()) for d in as_tuple(dims))
+
     def is_affine(self, dims):
         return any(AFFINE in self.get(d, ()) for d in as_tuple(dims))
 
diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py
index b85b017b4a..529258bada 100644
--- a/devito/ir/support/space.py
+++ b/devito/ir/support/space.py
@@ -6,7 +6,7 @@
 from cached_property import cached_property
 from sympy import Expr
 
-from devito.ir.support.utils import sdims_min, sdims_max
+from devito.ir.support.utils import minimum, maximum
 from devito.ir.support.vector import Vector, vmin, vmax
 from devito.tools import (PartialOrderTuple, Stamp, as_list, as_tuple, filter_ordered,
                           flatten, frozendict, is_integer, toposort)
@@ -286,8 +286,9 @@ def promote(self, cond):
         return self
 
     def expand(self):
-        return Interval(self.dim, sdims_min(self.lower), sdims_max(self.upper),
-                        self.stamp)
+        return Interval(
+            self.dim, minimum(self.lower), maximum(self.upper), self.stamp
+        )
 
 
 class IntervalGroup(PartialOrderTuple):
@@ -904,6 +905,18 @@ def promote(self, cond):
 
         return IterationSpace(intervals, sub_iterators, directions)
 
+    def prefix(self, key):
+        """
+        Return the IterationSpace up to and including the last Interval
+        such that `key(interval)` is True.
+        """
+        try:
+            i = self.project(key)[-1]
+        except IndexError:
+            return None
+
+        return self[:self.index(i.dim) + 1]
+
     def is_compatible(self, other):
         """
         A relaxed version of ``__eq__``, in which only non-derived dimensions
@@ -916,6 +929,10 @@ def is_compatible(self, other):
     def itdimensions(self):
         return self.intervals.dimensions
 
+    @property
+    def itdims(self):
+        return self.itdimensions
+
     @property
     def relations(self):
         return self.intervals.relations
diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py
index 5f08f48020..2b50073d2d 100644
--- a/devito/ir/support/utils.py
+++ b/devito/ir/support/utils.py
@@ -1,12 +1,16 @@
-from collections import defaultdict
+from collections import defaultdict, namedtuple
+from itertools import product
 
-from devito.symbolics import CallFromPointer, retrieve_indexed, retrieve_terminals
+from devito.finite_differences import IndexDerivative
+from devito.symbolics import (CallFromPointer, retrieve_indexed, retrieve_terminals,
+                              search)
 from devito.tools import DefaultOrderedDict, as_tuple, flatten, filter_sorted, split
 from devito.types import (Dimension, DimensionTuple, Indirection, ModuloDimension,
                           StencilDimension)
 
 __all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io',
-           'pull_dims', 'sdims_min', 'sdims_max']
+           'pull_dims', 'unbounded', 'minimum', 'maximum', 'minmax_index',
+           'extrema', 'erange']
 
 
 class AccessMode(object):
@@ -271,25 +275,91 @@ def pull_dims(exprs, flag=True):
 
 # *** Utility functions for expressions that potentially contain StencilDimensions
 
-def sdims_min(expr):
+def unbounded(expr):
     """
-    Replace all StencilDimensions in `expr` with their minimum point.
+    Retrieve all unbounded Dimensions in `expr`.
     """
-    try:
-        sdims = expr.find(StencilDimension)
-    except AttributeError:
-        return expr
-    mapper = {e: e._min for e in sdims}
-    return expr.subs(mapper)
+    # At the moment we only have logic to retrieve unbounded StencilDimensions,
+    # but in the future this might change
+    bound = set().union(*[i.dimensions for i in search(expr, IndexDerivative)])
+    sdims = search(expr, StencilDimension, mode='unique', deep=True)
+
+    return sdims - bound
+
+
+Extrema = namedtuple('Extrema', 'm M')
 
 
-def sdims_max(expr):
+def _minmax(expr, callback, udims=None):
     """
-    Replace all StencilDimensions in `expr` with their maximum point.
+    Helper for `minimum`, `maximum`, and potential future utilities that share
+    a significant chunk of logic.
     """
-    try:
-        sdims = expr.find(StencilDimension)
-    except AttributeError:
+    if not udims:
+        udims = unbounded(expr)
+
+    # Resolution rule 1: StencilDimensions
+    sdims = [d for d in udims if d.is_Stencil]
+    if not sdims:
         return expr
-    mapper = {e: e._max for e in sdims}
+    mapper = {e: callback(e) for e in sdims}
+
     return expr.subs(mapper)
+
+
+def minimum(expr, udims=None):
+    """
+    Substitute the unbounded Dimensions in `expr` with their minimum point.
+
+    Unbounded Dimensions whose possible minimum value is not known are ignored.
+    """
+    return _minmax(expr, lambda e: e._min, udims)
+
+
+def maximum(expr, udims=None):
+    """
+    Substitute the unbounded Dimensions in `expr` with their maximum point.
+
+    Unbounded Dimensions whose possible maximum value is not known are ignored.
+    """
+    return _minmax(expr, lambda e: e._max, udims)
+
+
+def extrema(expr):
+    """
+    The minimum and maximum extrema assumed by `expr` once the unbounded
+    Dimensions are resolved.
+    """
+    return Extrema(minimum(expr), maximum(expr))
+
+
+def minmax_index(expr, d):
+    """
+    Return the minimum and maximum indices along the `d` Dimension
+    among all Indexeds in `expr`.
+    """
+    indices = set()
+    for i in retrieve_indexed(expr):
+        try:
+            indices.add(i.indices[d])
+        except KeyError:
+            pass
+
+    return Extrema(min(minimum(i) for i in indices),
+                   max(maximum(i) for i in indices))
+
+
+def erange(expr):
+    """
+    All possible values that `expr` can assume once its unbounded Dimensions
+    are resolved.
+    """
+    udims = unbounded(expr)
+    if not udims:
+        return (expr,)
+
+    sdims = [d for d in udims if d.is_Stencil]
+    ranges = [i.range for i in sdims]
+    mappers = [dict(zip(sdims, i)) for i in product(*ranges)]
+
+    return tuple(expr.subs(m) for m in mappers)
diff --git a/devito/operator/operator.py b/devito/operator/operator.py
index 26202a419b..7b31ba65db 100644
--- a/devito/operator/operator.py
+++ b/devito/operator/operator.py
@@ -311,8 +311,15 @@ def _lower_exprs(cls, expressions, **kwargs):
         # Specialization is performed on unevaluated expressions
         expressions = cls._specialize_dsl(expressions, **kwargs)
 
-        # Lower functional DSL
+        # Lower FD derivatives
+        # NOTE: we force expansion of derivatives along SteppingDimensions
+        # because it drastically simplifies the subsequent lowering into
+        # ModuloDimensions
+        if not expand:
+            expand = lambda d: d.is_Stepping
         expressions = flatten([i._evaluate(expand=expand) for i in expressions])
+
+        # Scalarize the tensor equations, if any
         expressions = [j for i in expressions for j in i._flatten]
 
         # A second round of specialization is performed on evaluated expressions
@@ -922,15 +929,22 @@ def _emit_apply_profiling(self, args):
 
         # Emit local, i.e. "per-rank" performance. Without MPI, this is the only
         # thing that will be emitted
-        for k, v in summary.items():
-            rank = "[rank%d]" % k.rank if k.rank is not None else ""
+        def lower_perfentry(v):
             if v.gflopss:
                 oi = "OI=%.2f" % fround(v.oi)
                 gflopss = "%.2f GFlops/s" % fround(v.gflopss)
                 gpointss = "%.2f GPts/s" % fround(v.gpointss)
-                metrics = "[%s]" % ", ".join([oi, gflopss, gpointss])
+                return "[%s]" % ", ".join([oi, gflopss, gpointss])
+            elif v.gpointss:
+                gpointss = "%.2f GPts/s" % fround(v.gpointss)
+                return "[%s]" % gpointss
             else:
-                metrics = ""
+                return ""
+
+        for k, v in summary.items():
+            rank = "[rank%d]" % k.rank if k.rank is not None else ""
+
+            metrics = lower_perfentry(v)
 
             itershapes = [",".join(str(i) for i in its) for its in v.itershapes]
             if len(itershapes) > 1:
@@ -942,9 +956,12 @@ def _emit_apply_profiling(self, args):
             name = "%s%s<%s>" % (k.name, rank, itershapes)
 
             perf("%s* %s ran in %.2f s %s" % (indent, name, fround(v.time), metrics))
-            for n, time in summary.subsections.get(k.name, {}).items():
-                perf("%s+ %s ran in %.2f s [%.2f%%]" %
-                     (indent*2, n, time, fround(time/v.time*100)))
+            for n, v1 in summary.subsections.get(k.name, {}).items():
+                metrics = lower_perfentry(v1)
+
+                perf("%s+ %s ran in %.2f s [%.2f%%] %s" %
+                     (indent*2, n, fround(v1.time), fround(v1.time/v.time*100),
+                      metrics))
 
         # Emit performance mode and arguments
         perf_args = {}
diff --git a/devito/operator/profiling.py b/devito/operator/profiling.py
index 2a58677481..53f16e526f 100644
--- a/devito/operator/profiling.py
+++ b/devito/operator/profiling.py
@@ -105,6 +105,25 @@ def track_subsection(self, sname, name):
         v = self._subsections.setdefault(sname, OrderedDict())
         v[name] = SectionData(S.Zero, S.Zero, S.Zero, S.Zero, [])
 
+    def group_as_subsections(self, sname, sections):
+        ops = sum(self._sections[i].ops for i in sections)
+        points = sum(self._sections[i].points for i in sections)
+        traffic = sum(self._sections[i].traffic for i in sections)
+        sectiondata = SectionData(ops, S.Zero, points, traffic, [])
+
+        v = self._subsections.setdefault(sname, OrderedDict())
+        v.update({i: self._sections[i] for i in sections})
+
+        new_sections = OrderedDict()
+        for k, v in self._sections.items():
+            try:
+                if sections.index(k) == len(sections) - 1:
+                    new_sections[sname] = sectiondata
+            except ValueError:
+                new_sections[k] = v
+        self._sections.clear()
+        self._sections.update(new_sections)
+
     def instrument(self, iet, timer):
         """
         Instrument the given IET for C-level performance profiling.
@@ -211,6 +230,58 @@ class AdvancedProfiler(Profiler):
 
     _supports_async_sections = True
 
+    def _evaluate_section(self, name, data, args, dtype):
+        # Time to run the section
+        time = max(getattr(args[self.name]._obj, name), 10e-7)
+
+        # Number of FLOPs performed
+        try:
+            ops = int(subs_op_args(data.ops, args))
+        except (AttributeError, TypeError):
+            # E.g., a section comprising just function calls, or at least
+            # a sequence of unrecognized or non-conventional expr statements
+            ops = np.nan
+
+        try:
+            # Number of grid points computed
+            points = int(subs_op_args(data.points, args))
+
+            # Compulsory traffic
+            traffic = float(subs_op_args(data.traffic, args)*dtype().itemsize)
+        except (AttributeError, TypeError):
+            # E.g., the section has a dynamic loop size
+            points = np.nan
+
+            traffic = np.nan
+
+        # Nmber of FLOPs performed at each iteration
+        sops = data.sops
+
+        # Runtime itermaps/itershapes
+        try:
+            itermaps = [OrderedDict([(k, int(subs_op_args(v, args)))
+                                     for k, v in i.items()])
+                        for i in data.itermaps]
+            itershapes = tuple(tuple(i.values()) for i in itermaps)
+        except TypeError:
+            # E.g., a section comprising just function calls, or at least
+            # a sequence of unrecognized or non-conventional expr statements
+            itershapes = ()
+
+        return time, ops, points, traffic, sops, itershapes
+
+    def _allgather_from_comm(self, comm, time, ops, points, traffic, sops, itershapes):
+        times = comm.allgather(time)
+        assert comm.size == len(times)
+
+        opss = comm.allgather(ops)
+        pointss = comm.allgather(points)
+        traffics = comm.allgather(traffic)
+        sops = [sops]*comm.size
+        itershapess = comm.allgather(itershapes)
+
+        return list(zip(times, opss, pointss, traffics, sops, itershapess))
+
     # Override basic summary so that arguments other than runtime are computed.
     def summary(self, args, dtype, reduce_over=None):
         grid = args.grid
@@ -219,71 +290,30 @@ def summary(self, args, dtype, reduce_over=None):
         # Produce sections summary
         summary = PerformanceSummary()
         for name, data in self._sections.items():
-            # Time to run the section
-            time = max(getattr(args[self.name]._obj, name), 10e-7)
-
-            # Number of FLOPs performed
-            try:
-                ops = int(subs_op_args(data.ops, args))
-            except (AttributeError, TypeError):
-                # E.g., a section comprising just function calls, or at least
-                # a sequence of unrecognized or non-conventional expr statements
-                ops = np.nan
-
-            try:
-                # Number of grid points computed
-                points = int(subs_op_args(data.points, args))
-
-                # Compulsory traffic
-                traffic = float(subs_op_args(data.traffic, args)*dtype().itemsize)
-            except (AttributeError, TypeError):
-                # E.g., the section has a dynamic loop size
-                points = np.nan
-
-                traffic = np.nan
-
-            # Runtime itermaps/itershapes
-            try:
-                itermaps = [OrderedDict([(k, int(subs_op_args(v, args)))
-                                         for k, v in i.items()])
-                            for i in data.itermaps]
-                itershapes = tuple(tuple(i.values()) for i in itermaps)
-            except TypeError:
-                # E.g., a section comprising just function calls, or at least
-                # a sequence of unrecognized or non-conventional expr statements
-                itershapes = ()
+            items = self._evaluate_section(name, data, args, dtype)
 
             # Add local performance data
             if comm is not MPI.COMM_NULL:
                 # With MPI enabled, we add one entry per section per rank
-                times = comm.allgather(time)
-                assert comm.size == len(times)
-                opss = comm.allgather(ops)
-                pointss = comm.allgather(points)
-                traffics = comm.allgather(traffic)
-                sops = [data.sops]*comm.size
-                itershapess = comm.allgather(itershapes)
-                items = list(zip(times, opss, pointss, traffics, sops, itershapess))
+                items = self._allgather_from_comm(comm, *items)
                 for rank in range(comm.size):
                     summary.add(name, rank, *items[rank])
             else:
-                summary.add(name, None, time, ops, points, traffic, data.sops, itershapes)
+                summary.add(name, None, *items)
 
         # Enrich summary with subsections data
         for sname, v in self._subsections.items():
             for name, data in v.items():
-                # Time to run the section
-                time = max(getattr(args[self.name]._obj, name), 10e-7)
+                items = self._evaluate_section(name, data, args, dtype)
 
                 # Add local performance data
                 if comm is not MPI.COMM_NULL:
                     # With MPI enabled, we add one entry per section per rank
-                    times = comm.allgather(time)
-                    assert comm.size == len(times)
+                    items = self._allgather_from_comm(comm, *items)
                     for rank in range(comm.size):
-                        summary.add_subsection(sname, name, rank, time)
+                        summary.add_subsection(sname, name, rank, *items[rank])
                 else:
-                    summary.add_subsection(sname, name, None, time)
+                    summary.add_subsection(sname, name, None, *items)
 
         # Add global performance data
         if reduce_over is not None:
@@ -422,11 +452,11 @@ def add(self, name, rank, time,
 
         self.input[k] = PerfInput(time, ops, points, traffic, sops, itershapes)
 
-    def add_subsection(self, sname, name, rank, time):
+    def add_subsection(self, sname, name, rank, time, *args):
         k0 = PerfKey(sname, rank)
         assert k0 in self
 
-        self.subsections[sname][name] = time
+        self.subsections[sname][name] = PerfEntry(time, None, None, None, None, [])
 
     def add_glb_vanilla(self, key, time):
         """
diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index 1e8626da18..a7a1c57422 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -6,18 +6,19 @@
 import numpy as np
 import sympy
 
-from devito.finite_differences import EvalDerivative
+from devito.finite_differences import EvalDerivative, IndexDerivative, Weights
 from devito.ir import (SEQUENTIAL, PARALLEL_IF_PVT, ROUNDABLE, SEPARABLE, Forward,
-                       IterationInstance, IterationSpace, Interval, Cluster,
-                       Queue, IntervalGroup, LabeledVector, normalize_properties,
-                       relax_properties, sdims_min, sdims_max)
-from devito.symbolics import (Uxmapper, compare_ops, estimate_cost, q_constant,
-                              reuse_if_untouched, retrieve_indexed, search, uxreplace)
-from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict, generator,
-                          split, timed_pass)
-from devito.types import (Array, TempFunction, Eq, Symbol, Temp, ModuloDimension,
-                          CustomDimension, IncrDimension, StencilDimension, Indexed,
-                          Hyperplane)
+                       IterationSpace, Interval, Cluster, ExprGeometry, Queue,
+                       IntervalGroup, LabeledVector, Vector, normalize_properties,
+                       relax_properties, unbounded, minimum, maximum, extrema,
+                       vmax, vmin)
+from devito.symbolics import (Uxmapper, estimate_cost, search, reuse_if_untouched,
+                              uxreplace)
+from devito.tools import (Stamp, as_mapper, as_tuple, flatten, frozendict,
+                          is_integer, generator, split, timed_pass)
+from devito.types import (Eq, Symbol, Temp, TempArray, TempFunction,
+                          ModuloDimension, CustomDimension, IncrDimension,
+                          StencilDimension, Indexed, Hyperplane)
 from devito.types.grid import MultiSubDimension
 
 __all__ = ['cire']
@@ -82,10 +83,13 @@ def cire(clusters, mode, sregistry, options, platform):
     t0 = 2.0*t2[x,y,z]
     t1 = 3.0*t2[x,y,z+1]
     """
-    modes = {
-        'invariants': [CireInvariantsElementary, CireInvariantsDivs],
-        'sops': [CireSops],
-    }
+    # NOTE: Handle prematurely expanded derivatives -- current default on
+    # several backends, but soon to become legacy
+    if mode == 'sops':
+        if options['expand']:
+            mode = 'eval-derivs'
+        else:
+            mode = 'index-derivs'
 
     for cls in modes[mode]:
         transformer = cls(sregistry, options, platform)
@@ -120,19 +124,17 @@ def _aliases_from_clusters(self, clusters, exclude, meta):
             # Clusters -> AliasList
             found = collect(mapper.extracted, meta.ispace, self.opt_minstorage)
             pexprs, aliases = choose(found, exprs, mapper, self.opt_mingain)
-            if not aliases:
-                continue
 
             # AliasList -> Schedule
             schedule = lower_aliases(aliases, meta, self.opt_maxpar)
 
             variants.append(Variant(schedule, pexprs))
 
-        # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off)
         if not variants:
             return []
-        schedule, exprs = pick_best(variants, self.opt_schedule_strategy,
-                                    self._eval_variants_delta)
+
+        # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off)
+        schedule, exprs = self._select(variants)
 
         # Schedule -> Schedule (optimization)
         if self.opt_rotate:
@@ -140,7 +142,8 @@ def _aliases_from_clusters(self, clusters, exclude, meta):
         schedule = optimize_schedule_padding(schedule, meta, self.platform)
 
         # Schedule -> [Clusters]_k
-        processed, subs = lower_schedule(schedule, meta, self.sregistry, self.opt_ftemps)
+        processed, subs = lower_schedule(schedule, meta, self.sregistry,
+                                         self.opt_ftemps)
 
         # [Clusters]_k -> [Clusters]_k (optimization)
         if self.opt_multisubdomain:
@@ -162,6 +165,46 @@ def _aliases_from_clusters(self, clusters, exclude, meta):
 
         return processed
 
+    def process(self, clusters):
+        raise NotImplementedError
+
+    def _generate(self, exprs, exclude):
+        """
+        Generate one or more extractions from ``exprs``. An extraction is a
+        set of CIRE candidates which may be turned into aliases. Two different
+        extractions may contain overlapping sub-expressions and, therefore,
+        should be processed and evaluated indipendently. An extraction won't
+        contain any of the symbols appearing in ``exclude``.
+        """
+        raise NotImplementedError
+
+    def _lookup_key(self, c):
+        """
+        Create a key for the given Cluster. Clusters with same key may be
+        processed together in the search for CIRE candidates. Clusters should
+        have a different key if they must not be processed together, e.g.,
+        when this would lead to violation of data dependencies.
+        """
+        raise NotImplementedError
+
+    def _select(self, variants):
+        """
+        Select the best variant out of a set of variants, weighing flops
+        and working set.
+        """
+        raise NotImplementedError
+
+    def _eval_variants_delta(self, delta_flops, delta_ws):
+        """
+        Given the deltas in flop reduction and working set size increase of two
+        Variants, return True if the second variant is estimated to deliever
+        better performance, False otherwise.
+        """
+        raise NotImplementedError
+
+
+class CireTransformerLegacy(CireTransformer):
+
     def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None):
         """
         Carry out the bulk of the work of ``_generate``.
@@ -192,38 +235,8 @@ def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None):
 
         return mapper
 
-    def _generate(self, exprs, exclude):
-        """
-        Generate one or more extractions from ``exprs``. An extraction is a
-        set of CIRE candidates which may be turned into aliases. Two different
-        extractions may contain overlapping sub-expressions and, therefore,
-        should be processed and evaluated indipendently. An extraction won't
-        contain any of the symbols appearing in ``exclude``.
-        """
-        raise NotImplementedError
-
-    def _lookup_key(self, c):
-        """
-        Create a key for the given Cluster. Clusters with same key may be
-        processed together in the search for CIRE candidates. Clusters should
-        have a different key if they must not be processed together, e.g.,
-        when this would lead to violation of data dependencies.
-        """
-        raise NotImplementedError
-
-    def _eval_variants_delta(self, delta_flops, delta_ws):
-        """
-        Given the deltas in flop reduction and working set size increase of two
-        Variants, return True if the second variant is estimated to deliever
-        better performance, False otherwise.
-        """
-        raise NotImplementedError
-
-    def process(self, clusters):
-        raise NotImplementedError
 
-
-class CireInvariants(CireTransformer, Queue):
+class CireInvariants(CireTransformerLegacy, Queue):
 
     def __init__(self, sregistry, options, platform):
         super().__init__(sregistry, options, platform)
@@ -271,6 +284,9 @@ def _lookup_key(self, c, d):
 
         return AliasKey(ispace, intervals, c.dtype, c.guards, properties)
 
+    def _select(self, variants):
+        return pick_best(variants, self._eval_variants_delta)
+
     def _eval_variants_delta(self, delta_flops, delta_ws):
         # Always prefer the Variant with fewer temporaries
         return ((delta_ws == 0 and delta_flops < 0) or
@@ -309,7 +325,7 @@ def _generate(self, exprs, exclude):
         yield self._do_generate(exprs, exclude, cbk_search)
 
 
-class CireSops(CireTransformer):
+class CireDerivatives(CireTransformerLegacy):
 
     def __init__(self, sregistry, options, platform):
         super().__init__(sregistry, options, platform)
@@ -340,55 +356,117 @@ def process(self, clusters):
         return processed
 
     def _generate(self, exprs, exclude):
-        # E.g., extract `u.dx*a*b` and `u.dx*a*c` from `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]`
-        def cbk_search(expr):
-            if isinstance(expr, EvalDerivative) and not expr.base.is_Function:
-                return expr.args
-            else:
-                return flatten(e for e in [cbk_search(a) for a in expr.args] if e)
-
-        cbk_compose = lambda e: split_coeff(e)[1]
-        basextr = self._do_generate(exprs, exclude, cbk_search, cbk_compose)
+        # E.g., extract `u.dx*a*b` and `u.dx*a*c` from
+        # `[(u.dx*a*b).dy`, `(u.dx*a*c).dy]`
+        basextr = self._do_generate(exprs, exclude, self._cbk_search,
+                                    self._cbk_compose)
         if not basextr:
             return
         yield basextr
 
         # E.g., extract `u.dx*a` from `[(u.dx*a*b).dy, (u.dx*a*c).dy]`
-        # That is, attempt extracting the largest common derivative-induced subexprs
+        # I.e., attempt extracting the largest common derivative-induced subexprs
         mappers = [deindexify(e) for e in basextr.extracted]
         counter = Counter(flatten(m.keys() for m in mappers))
         groups = as_mapper(counter, key=counter.get)
         grank = {k: sorted(v, key=lambda e: estimate_cost(e), reverse=True)
                  for k, v in groups.items()}
 
-        def cbk_search2(expr, rank):
-            ret = []
-            for e in cbk_search(expr):
-                mapper = deindexify(e)
-                for i in rank:
-                    if i in mapper:
-                        ret.extend(mapper[i])
-                        break
-            return ret
-
         candidates = sorted(grank, reverse=True)[:2]
         for i in candidates:
             lower_pri_elems = flatten([grank[j] for j in candidates if j != i])
-            cbk_search_i = lambda e: cbk_search2(e, grank[i] + lower_pri_elems)
-            yield self._do_generate(exprs, exclude, cbk_search_i, cbk_compose)
+            cbk_search = lambda e: self._cbk_search2(e, grank[i] + lower_pri_elems)
+            yield self._do_generate(exprs, exclude, cbk_search, self._cbk_compose)
 
     def _lookup_key(self, c):
         return AliasKey(c.ispace, None, c.dtype, c.guards, c.properties)
 
+    def _select(self, variants):
+        if isinstance(self.opt_schedule_strategy, int):
+            try:
+                return variants[self.opt_schedule_strategy]
+            except IndexError:
+                raise ValueError("Illegal schedule %d; "
+                                 "generated %d schedules in total"
+                                 % (self.opt_schedule_strategy, len(variants)))
+
+        return pick_best(variants, self._eval_variants_delta)
+
     def _eval_variants_delta(self, delta_flops, delta_ws):
         # If there's a greater flop reduction using fewer temporaries, no doubts
         # what's gonna be the best variant. But if the better flop reduction
         # comes at the price of using more temporaries, then we have to apply
-        # heuristics, in particular we estimate how many flops would a temporary
+        # heuristics, in particular we estimate how many flops a temporary would
         # allow to save
-        return ((delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or
-                (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or
-                (delta_flops <= 0 and delta_ws >= 0))
+        return (
+            (delta_flops >= 0 and delta_ws > 0 and delta_flops / delta_ws < 15) or
+            (delta_flops <= 0 and delta_ws < 0 and delta_flops / delta_ws > 15) or
+            (delta_flops <= 0 and delta_ws >= 0)
+        )
+
+
+class CireEvalDerivatives(CireDerivatives):
+
+    def _cbk_compose(self, expr):
+        return split_coeff(expr)[1]
+
+    def _cbk_search(self, expr):
+        if isinstance(expr, EvalDerivative) and not expr.base.is_Function:
+            return expr.args
+        else:
+            return flatten(e for e in [self._cbk_search(a) for a in expr.args] if e)
+
+    def _cbk_search2(self, expr, rank):
+        ret = []
+        for e in self._cbk_search(expr):
+            mapper = deindexify(e)
+            for i in rank:
+                if i in mapper:
+                    ret.extend(mapper[i])
+                    break
+        return ret
+
+
+class CireIndexDerivatives(CireDerivatives):
+
+    def _cbk_compose(self, expr):
+        if expr.is_Pow:
+            return (expr,)
+        terms = []
+        for a in expr.args:
+            try:
+                if not isinstance(a.function, Weights):
+                    terms.append(a)
+            except AttributeError:
+                terms.append(a)
+        return tuple(terms)
+
+    def _cbk_search(self, expr):
+        if isinstance(expr, IndexDerivative):
+            return (expr.expr,)
+        else:
+            return flatten(e for e in [self._cbk_search(a) for a in expr.args] if e)
+
+    def _cbk_search2(self, expr, rank):
+        ret = []
+        for e in self._cbk_search(expr):
+            mapper = deindexify(e)
+            for i in rank:
+                found = [v.expr if isinstance(v, IndexDerivative) else v
+                         for v in mapper.get(i, [])]
+                if found:
+                    ret.extend(found)
+                    break
+        return ret
+
+
+# Subpass mapper
+modes = {
+    'invariants': [CireInvariantsElementary,
+                   CireInvariantsDivs],
+    'eval-derivs': [CireEvalDerivatives],   # NOTE: legacy pass
+    'index-derivs': [CireIndexDerivatives],
+}
 
 
 def collect(extracted, ispace, minstorage):
@@ -438,28 +516,9 @@ def collect(extracted, ispace, minstorage):
     for expr in extracted:
         assert not expr.is_Equality
 
-        indexeds = retrieve_indexed(expr)
-
-        bases = []
-        offsets = []
-        for i in indexeds:
-            ii = IterationInstance(i)
-            if ii.is_irregular:
-                break
-
-            base = []
-            offset = []
-            for e, ai in zip(ii, ii.aindices):
-                if q_constant(e):
-                    base.append(e)
-                else:
-                    base.append(ai)
-                    offset.append((ai, e - ai))
-            bases.append(tuple(base))
-            offsets.append(LabeledVector(offset))
-
-        if not indexeds or len(bases) == len(indexeds):
-            found.append(Candidate(expr, ispace, indexeds, bases, offsets))
+        eg = ExprGeometry(expr)
+        if eg.is_regular:
+            found.append(eg)
 
     # Create groups of aliasing expressions
     mapper = OrderedDict()
@@ -468,17 +527,13 @@ def collect(extracted, ispace, minstorage):
         c = unseen.pop(0)
         group = [c]
         for u in list(unseen):
-            # Is the arithmetic structure of `c` and `u` equivalent ?
-            if not compare_ops(c.expr, u.expr):
-                continue
-
             # Is `c` translated w.r.t. `u` ?
             if not c.translated(u):
                 continue
 
             group.append(u)
             unseen.remove(u)
-        group = Group(group)
+        group = Group(group, ispace=ispace)
 
         if minstorage:
             k = group.dimensions_translated
@@ -561,8 +616,7 @@ def collect(extracted, ispace, minstorage):
             na = g.naliases
             nr = nredundants(ispace, pivot)
             score = estimate_cost(pivot, True)*((na - 1) + nr)
-            if score > 0:
-                aliases.add(pivot, aliaseds, list(mapper), distances, score)
+            aliases.add(pivot, aliaseds, list(mapper), distances, score)
 
     return aliases
 
@@ -575,17 +629,17 @@ def choose(aliases, exprs, mapper, mingain):
     """
     aliases = AliasList(aliases)
 
-    # `score < m` => discarded
+    if not aliases:
+        return exprs, aliases
+
+    # `score <= m` => discarded
     # `score > M` => optimized
-    # `m <= score <= M` => maybe discarded, maybe optimized -- depends on heuristics
+    # `m <= score <= M` => maybe discarded, maybe optimized; depends on heuristics
     m = mingain
     M = mingain*3
 
-    if not aliases:
-        return exprs, aliases
-
     # Filter off the aliases with low score
-    key = lambda a: a.score >= m
+    key = lambda a: a.score > m
     aliases.filter(key)
 
     # Project the candidate aliases into `exprs` to derive the final working set
@@ -596,7 +650,7 @@ def choose(aliases, exprs, mapper, mingain):
     # Filter off the aliases with a weak flop-reduction / working-set tradeoff
     key = lambda a: \
         a.score > M or \
-        m <= a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset)
+        m < a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset)
     aliases.filter(key)
 
     if not aliases:
@@ -688,8 +742,9 @@ def lower_aliases(aliases, meta, maxpar):
 
             # Given the iteration `interval`, lower distances to indices
             for distance, indices in zip(a.distances, indicess):
+                v = distance[interval.dim] or 0
                 try:
-                    indices.append(d - interval.lower + distance[interval.dim])
+                    indices.append(d - interval.lower + v)
                 except TypeError:
                     indices.append(d)
 
@@ -826,7 +881,7 @@ def lower_schedule(schedule, meta, sregistry, ftemps):
         make = TempFunction
     else:
         # Typical case -- the user does *not* "see" the CIRE-created temporaries
-        make = Array
+        make = TempArray
 
     clusters = []
     subs = {}
@@ -927,29 +982,23 @@ def optimize_clusters_msds(clusters):
     return processed
 
 
-def pick_best(variants, schedule_strategy, eval_variants_delta):
+def pick_best(variants, eval_variants_delta):
     """
     Return the variant with the best trade-off between operation count
     reduction and working set increase. Heuristics may be applied.
     """
-    if type(schedule_strategy) is int:
-        try:
-            return variants[schedule_strategy]
-        except IndexError:
-            raise ValueError("Illegal schedule strategy %d; accepted `[0, %d]`"
-                             % (schedule_strategy, len(variants) - 1))
-
     best = None
     best_flops_score = None
     best_ws_score = None
 
     for i in variants:
-        i_flops_score = sum(sa.score for sa in i.schedule)
+        i_flops_score = i.schedule.score
 
         # The working set score depends on the number and dimensionality of
         # temporaries required by the Schedule
         i_ws_count = Counter([len(sa.writeto) for sa in i.schedule])
-        i_ws_score = tuple(i_ws_count[sa + 1] for sa in reversed(range(max(i_ws_count))))
+        i_ws_score = [i_ws_count[sa + 1]
+                      for sa in reversed(range(max(i_ws_count, default=0)))]
         # TODO: For now, we assume the performance impact of an N-dimensional
         # temporary is always the same regardless of the value N, but this might
         # change in the future
@@ -970,91 +1019,36 @@ def pick_best(variants, schedule_strategy, eval_variants_delta):
 # Utilities
 
 
-class Candidate(object):
-
-    def __init__(self, expr, ispace, indexeds, bases, offsets):
-        self.expr = expr
-        self.ispace = ispace
-        self.indexeds = indexeds
-        self.bases = bases
-        self.offsets = offsets
-
-    def __repr__(self):
-        return "Candidate(expr=%s)" % self.expr
-
-    def translated(self, other):
-        """
-        True if ``self`` is translated w.r.t. ``other``, False otherwise.
-
-        Examples
-        --------
-        Two candidates are translated if their bases are the same and
-        their offsets are pairwise translated.
-
-        c := A[i,j] op A[i,j+1]     -> Toffsets = {i: [0,0], j: [0,1]}
-        u := A[i+1,j] op A[i+1,j+1] -> Toffsets = {i: [1,1], j: [0,1]}
-
-        Then `c` is translated w.r.t. `u` with distance `{i: 1, j: 0}`
-        """
-        if len(self.Toffsets) != len(other.Toffsets):
-            return False
-        if len(self.bases) != len(other.bases):
-            return False
-
-        # Check the bases
-        if any(b0 != b1 for b0, b1 in zip(self.bases, other.bases)):
-            return False
-
-        # Check the offsets
-        for (d0, o0), (d1, o1) in zip(self.Toffsets, other.Toffsets):
-            if d0 is not d1:
-                return False
-
-            distance = set(o0 - o1)
-            if len(distance) != 1:
-                return False
-
-        return True
-
-    @cached_property
-    def Toffsets(self):
-        return LabeledVector.transpose(*self.offsets)
-
-    @cached_property
-    def dimensions(self):
-        return frozenset(i for i, _ in self.Toffsets)
-
-    @property
-    def shifts(self):
-        return self.ispace.intervals
-
-
 class Group(tuple):
 
     """
     A collection of aliasing expressions.
     """
 
-    def __new__(cls, items):
+    def __new__(cls, items, ispace=None):
         # Expand out the StencilDimensions, if any
         processed = []
         for c in items:
-            if not c.expr.find(StencilDimension):
+            sdims = [d for d in unbounded(c.expr) if d.is_Stencil]
+            if not sdims:
                 processed.append(c)
                 continue
 
-            for f in (sdims_min, sdims_max):
+            f0 = lambda e: minimum(e, sdims)
+            f1 = lambda e: maximum(e, sdims)
+
+            for f in (f0, f1):
                 expr = f(c.expr)
                 indexeds = [f(i) for i in c.indexeds]
                 offsets = [LabeledVector([(d, f(i)) for d, i in v.items()])
                            for v in c.offsets]
 
-                processed.append(
-                    Candidate(expr, c.ispace, indexeds, c.bases, offsets)
-                )
+                processed.append(ExprGeometry(expr, indexeds, c.bases, offsets))
 
         obj = super().__new__(cls, processed)
         obj._items = items
+        obj._ispace = ispace
+
         return obj
 
     def __repr__(self):
@@ -1094,6 +1088,8 @@ def diameter(self):
         ret = defaultdict(int)
         for i in self.Toffsets:
             for d, v in i:
+                if d not in self._ispace:
+                    continue
                 try:
                     distance = int(max(v) - min(v))
                 except TypeError:
@@ -1101,7 +1097,13 @@ def diameter(self):
                     if len(set(v)) == 1:
                         continue
                     else:
-                        raise ValueError
+                        # Worst-case scenario, we raraly end up here
+                        # Resort to the fast vector-based comparison machinery
+                        # (rather than the slower sympy.simplify)
+                        items = [Vector(i) for i in v]
+                        distance, = vmax(*items) - vmin(*items)
+                        if not is_integer(distance):
+                            raise ValueError
                 ret[d] = max(ret[d], distance)
 
         return ret
@@ -1109,13 +1111,13 @@ def diameter(self):
     @property
     def pivot(self):
         """
-        A deterministically chosen Candidate for this Group.
+        A deterministically chosen reference for this Group.
         """
         return self[0]
 
     @property
     def dimensions(self):
-        return self.pivot.dimensions
+        return frozenset(self.diameter)
 
     @property
     def dimensions_translated(self):
@@ -1125,10 +1127,11 @@ def dimensions_translated(self):
     def naliases(self):
         na = len(self._items)
 
-        sdims = set().union(*[c.expr.find(StencilDimension) for c in self._items])
-        implicit = int(np.prod([i._size for i in sdims])) - 1
+        udims = set().union(*[unbounded(c.expr) for c in self._items])
+        sdims = [d for d in udims if d.is_Stencil]
+        implicit = int(np.prod([i._size for i in sdims]))
 
-        return na + implicit
+        return na*max(implicit, 1)
 
     @cached_property
     def _pivot_legal_rotations(self):
@@ -1158,7 +1161,7 @@ def _pivot_legal_rotations(self):
     def _pivot_min_intervals(self):
         """
         The minimum Interval along each Dimension such that by evaluating the
-        pivot, all Candidates are evaluated too.
+        pivot, all other items are evaluated too.
         """
         c = self.pivot
 
@@ -1191,17 +1194,23 @@ def _pivot_legal_shifts(self):
         ret = defaultdict(lambda: (-np.inf, np.inf))
         for i, ofs in zip(c.indexeds, c.offsets):
             f = i.function
+
             for l in ofs.labels:
+                if isinstance(l, StencilDimension):
+                    continue
+
                 # `f`'s cumulative halo size along `l`
                 hsize = sum(f._size_halo[l])
 
                 # Any `ofs`'s shift due to non-[0,0] iteration space
-                lower, upper = c.shifts[l].offsets
+                lower, upper = self._ispace.intervals[l].offsets
+
+                ofs0, ofs1 = extrema(ofs[l])
 
                 try:
                     # Assume `ofs[l]` is a number (typical case)
-                    maxd = min(0, max(ret[l][0], -ofs[l] - lower))
-                    mini = max(0, min(ret[l][1], hsize - ofs[l] - upper))
+                    maxd = min(0, max(ret[l][0], -ofs0 - lower))
+                    mini = max(0, min(ret[l][1], hsize - ofs1 - upper))
 
                     ret[l] = (maxd, mini)
                 except TypeError:
@@ -1321,6 +1330,10 @@ def rebuild(self, *items, dmapper=None, rmapper=None, is_frame=False):
             is_frame=is_frame or self.is_frame
         )
 
+    @property
+    def score(self):
+        return sum(i.score for i in self)
+
 
 def make_rotations_table(d, v):
     """
diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py
index 741c8bf558..d731fa2126 100644
--- a/devito/passes/clusters/blocking.py
+++ b/devito/passes/clusters/blocking.py
@@ -1,11 +1,12 @@
 from sympy import sympify
 
+from devito.finite_differences.differentiable import IndexSum
 from devito.ir.clusters import Queue
 from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT,
                                SEQUENTIAL, SKEWABLE, TILABLES, Interval,
                                IntervalGroup, IterationSpace, Scope)
 from devito.passes import is_on_device
-from devito.symbolics import uxreplace, xreplace_indices
+from devito.symbolics import search, uxreplace, xreplace_indices
 from devito.tools import UnboundedMultiTuple, as_tuple, flatten, is_integer, prod
 from devito.types import BlockDimension
 
@@ -100,6 +101,8 @@ def _has_data_reuse(self, cluster):
         # which translates into the existance of any Relation accross Indexeds
         if any(r.function.is_AbstractFunction for r in cluster.scope.r_gen()):
             return True
+        if search(cluster.exprs, IndexSum):
+            return True
 
         # If it's a reduction operation a la matrix-matrix multiply, two Indexeds
         # might be enough
@@ -314,7 +317,7 @@ def callback(self, clusters, prefix, blk_size_gen=None):
             # By passing a suitable key to `next` we ensure that we pull the
             # next par-tile entry iff we're now blocking an unseen TILABLE nest
             try:
-                step = sympify(blk_size_gen.next(clusters, d))
+                step = sympify(blk_size_gen.next(prefix, d, clusters))
             except StopIteration:
                 return clusters
         else:
@@ -418,6 +421,7 @@ class BlockSizeGenerator(object):
 
     def __init__(self, par_tile):
         self.umt = UnboundedMultiTuple(*par_tile)
+        self.tip = -1
 
         # This is for Clusters that need a small par-tile to avoid under-utilizing
         # computational resources (e.g., kernels running over iteration spaces that
@@ -430,15 +434,45 @@ def __init__(self, par_tile):
         else:
             self.umt_small = UnboundedMultiTuple(par_tile.default)
 
-    def next(self, clusters, d):
+    def next(self, prefix, d, clusters):
+        # If a whole new set of Dimensions, move the tip -- this means `clusters`
+        # at `d` represents a new loop nest or kernel
+        x = any(i.dim.is_Block for i in flatten(c.ispace for c in clusters))
+        if not x:
+            self.tip += 1
+
         # TODO: This is for now exceptionally rudimentary
         if all(c.properties.is_blockable_small(d) for c in clusters):
-            umt = self.umt_small
+            if not x:
+                self.umt_small.iter()
+            return self.umt_small.next()
+
+        if x:
+            item = self.umt.curitem
         else:
+            # We can't `self.umt.iter()` because we might still want to
+            # fallback to `self.umt_small`
+            item = self.umt.nextitem
+
+        # Handle user-provided rules
+        # TODO: This is also rudimentary
+        if item.rule is None:
             umt = self.umt
+        elif is_integer(item.rule):
+            if item.rule == self.tip:
+                umt = self.umt
+            else:
+                umt = self.umt_small
+        else:
+            if item.rule in {d.name for d in prefix.itdims}:
+                umt = self.umt
+            else:
+                # This is like "pattern unmatched" -- fallback to `umt_small`
+                umt = self.umt_small
 
-        if not any(i.dim.is_Block for i in flatten(c.ispace for c in clusters)):
+        if not x:
             umt.iter()
+
         return umt.next()
 
 
diff --git a/devito/passes/clusters/cse.py b/devito/passes/clusters/cse.py
index 85b4e379df..5f00a0341e 100644
--- a/devito/passes/clusters/cse.py
+++ b/devito/passes/clusters/cse.py
@@ -8,6 +8,7 @@
 from devito.passes.clusters.utils import makeit_ssa
 from devito.symbolics import estimate_cost, q_leaf
 from devito.symbolics.manipulation import _uxreplace
+from devito.tools import as_list
 from devito.types import Eq, Temp as Temp0
 
 __all__ = ['cse']
@@ -52,13 +53,18 @@ def _cse(maybe_exprs, make, min_cost=1, mode='default'):
     # also ensuring some form of post-processing
     assert mode == 'default'  # Only supported mode ATM
 
-    # Just for flexibility, accept either Clusters or exprs
+    # Accept Clusters, Eqs or even just exprs
     if isinstance(maybe_exprs, Cluster):
         processed = list(maybe_exprs.exprs)
         scope = maybe_exprs.scope
     else:
-        processed = list(maybe_exprs)
-        scope = Scope(maybe_exprs)
+        maybe_exprs = as_list(maybe_exprs)
+        if all(e.is_Equality for e in maybe_exprs):
+            processed = maybe_exprs
+            scope = Scope(maybe_exprs)
+        else:
+            processed = [Eq(make(), e) for e in maybe_exprs]
+            scope = Scope([])
 
     # Some sub-expressions aren't really "common" -- that's the case of Dimension-
     # independent data dependences. For example:
diff --git a/devito/passes/clusters/derivatives.py b/devito/passes/clusters/derivatives.py
index 00e7526f88..fd0d8bdcad 100644
--- a/devito/passes/clusters/derivatives.py
+++ b/devito/passes/clusters/derivatives.py
@@ -1,5 +1,5 @@
 from devito.finite_differences import IndexDerivative
-from devito.ir import Interval, IterationSpace, Queue
+from devito.ir import Backward, Forward, Interval, IterationSpace, Queue
 from devito.passes.clusters.misc import fuse
 from devito.symbolics import (retrieve_dimensions, reuse_if_untouched, q_leaf,
                               uxreplace)
@@ -87,7 +87,8 @@ def _core(expr, c, weights, mapper, sregistry):
     dims = tuple(d for d in dims if d not in c.ispace)
 
     intervals = [Interval(d) for d in dims]
-    ispace0 = IterationSpace(intervals)
+    directions = {d: Backward if d.backward else Forward for d in dims}
+    ispace0 = IterationSpace(intervals, directions=directions)
 
     extra = (c.ispace.itdimensions + dims,)
     ispace = IterationSpace.union(c.ispace, ispace0, relations=extra)
@@ -100,7 +101,9 @@ def _core(expr, c, weights, mapper, sregistry):
 
     # Transform e.g. `w[i0] -> w[i0 + 2]` for alignment with the
     # StencilDimensions starting points
-    subs = {expr.weights: expr.weights.subs(d, d - d._min) for d in dims}
+    subs = {expr.weights:
+            expr.weights.subs(d, d - (d._max if d.backward else d._min))
+            for d in dims}
     expr1 = Inc(s, uxreplace(expr.expr, subs))
     processed.append(c.rebuild(exprs=expr1, ispace=ispace))
 
diff --git a/devito/passes/clusters/factorization.py b/devito/passes/clusters/factorization.py
index f023762402..1353bf89b3 100644
--- a/devito/passes/clusters/factorization.py
+++ b/devito/passes/clusters/factorization.py
@@ -142,13 +142,16 @@ def run(expr):
                     terms.append(i)
 
             # Collect common funcs
-            w_funcs = Add(*w_funcs, evaluate=False)
-            w_funcs = collect(w_funcs, funcs, evaluate=False)
-            try:
-                terms.extend([Mul(k, collect_const(v), evaluate=False)
-                              for k, v in w_funcs.items()])
-            except AttributeError:
-                assert w_funcs == 0
+            if len(w_funcs) > 1:
+                w_funcs = Add(*w_funcs, evaluate=False)
+                w_funcs = collect(w_funcs, funcs, evaluate=False)
+                try:
+                    terms.extend([Mul(k, collect_const(v), evaluate=False)
+                                  for k, v in w_funcs.items()])
+                except AttributeError:
+                    assert w_funcs == 0
+            else:
+                terms.extend(w_funcs)
 
             # Collect common pows
             w_pows = Add(*w_pows, evaluate=False)
diff --git a/devito/passes/clusters/misc.py b/devito/passes/clusters/misc.py
index 9d0e7463bd..3596bed4e0 100644
--- a/devito/passes/clusters/misc.py
+++ b/devito/passes/clusters/misc.py
@@ -180,7 +180,7 @@ def _key(self, c):
         if isinstance(c, Cluster):
             syncs = (c.syncs,)
         else:
-            syncs = c.syncs
+            syncs = tuple(i.syncs for i in c)
         for i in syncs:
             mapper = defaultdict(set)
             for k, v in i.items():
@@ -301,7 +301,7 @@ def choose_element(queue, scheduled):
 
         return ClusterGroup(dag.topological_sort(choose_element), prefix)
 
-    def _build_dag(self, cgroups, prefix, peeking=False):
+    def _build_dag(self, cgroups, prefix):
         """
         A DAG representing the data dependences across the ClusterGroups within
         a given scope.
@@ -348,9 +348,6 @@ def is_cross(dep):
                 elif any(scope.d_output_gen()):
                     dag.add_edge(cg0, cg1)
 
-            if peeking and dag.edges:
-                return dag
-
         return dag
 
 
diff --git a/devito/passes/equations/linearity.py b/devito/passes/equations/linearity.py
index 0950e1126f..66390f821e 100644
--- a/devito/passes/equations/linearity.py
+++ b/devito/passes/equations/linearity.py
@@ -23,10 +23,16 @@ def collect_derivatives(expressions):
         mapper = inspect(e)
 
         # E.g., 0.2*u.dx -> (0.2*u).dx
-        ep = aggregate_coeffs(e, mapper)
+        e1 = aggregate_coeffs(e, mapper)
 
         # E.g., (0.2*u).dx + (0.3*v).dx -> (0.2*u + 0.3*v).dx
-        processed.append(factorize_derivatives(ep))
+        e2 = factorize_derivatives(e1)
+        if e2 == e1:
+            # No luck, stick to `e` to preserve e.g. the original
+            # coefficient factorization
+            processed.append(e)
+        else:
+            processed.append(e2)
 
     return processed
 
diff --git a/devito/passes/iet/instrument.py b/devito/passes/iet/instrument.py
index db9b446d7f..b844fd9a08 100644
--- a/devito/passes/iet/instrument.py
+++ b/devito/passes/iet/instrument.py
@@ -1,10 +1,12 @@
-from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Section,
-                           TimedList, Transformer)
+from itertools import groupby
+
+from devito.ir.iet import (BusyWait, FindNodes, FindSymbols, MapNodes, Iteration,
+                           Section, TimedList, Transformer)
 from devito.mpi.routines import (HaloUpdateCall, HaloWaitCall, MPICall, MPIList,
                                  HaloUpdateList, HaloWaitList, RemainderCall,
                                  ComputeCall)
 from devito.passes.iet.engine import iet_pass
-from devito.types import Timer
+from devito.types import TempArray, TempFunction, Timer
 
 __all__ = ['instrument']
 
@@ -25,10 +27,12 @@ def instrument(graph, **kwargs):
 @iet_pass
 def track_subsections(iet, **kwargs):
     """
-    Add custom Sections to the `profiler`. Custom Sections include:
+    Add sub-Sections to the `profiler`. Sub-Sections include:
 
         * MPI Calls (e.g., HaloUpdateCall and HaloUpdateWait)
         * Busy-waiting on While(lock) (e.g., from host-device orchestration)
+        * Multi-pass implementations -- one sub-Section for each pass, within one
+          macro Section
     """
     profiler = kwargs['profiler']
     sregistry = kwargs['sregistry']
@@ -45,6 +49,7 @@ def track_subsections(iet, **kwargs):
 
     mapper = {}
 
+    # MPI Calls, busy-waiting
     for NodeType in [MPIList, MPICall, BusyWait, ComputeCall]:
         for k, v in MapNodes(Section, NodeType).visit(iet).items():
             for i in v:
@@ -57,6 +62,37 @@ def track_subsections(iet, **kwargs):
 
     iet = Transformer(mapper).visit(iet)
 
+    # Multi-pass implementations
+    mapper = {}
+
+    for i in FindNodes(Iteration).visit(iet):
+        for k, g in groupby(i.nodes, key=lambda n: n.is_Section):
+            if not k:
+                continue
+
+            candidates = []
+            for i in g:
+                functions = FindSymbols().visit(i)
+                if any(isinstance(f, (TempArray, TempFunction)) for f in functions):
+                    candidates.append(i)
+                else:
+                    # They must be consecutive Sections
+                    break
+            if len(candidates) < 2:
+                continue
+
+            name = sregistry.make_name(prefix='multipass')
+            body = [i._rebuild(is_subsection=True) for i in candidates]
+            section = Section(name, body=body)
+
+            profiler.group_as_subsections(name, [i.name for i in candidates])
+
+            mapper[candidates.pop(0)] = section
+            for i in candidates:
+                mapper[i] = None
+
+    iet = Transformer(mapper).visit(iet)
+
     return iet, {}
 
 
@@ -100,6 +136,6 @@ def sync_sections(iet, lang=None, profiler=None, **kwargs):
         if runs_async and not unnecessary:
             mapper[tl] = tl._rebuild(body=tl.body + (sync,))
 
-    iet = Transformer(mapper).visit(iet)
+    iet = Transformer(mapper, nested=True).visit(iet)
 
     return iet, {}
diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py
index 4199283e54..e83aeae943 100644
--- a/devito/passes/iet/misc.py
+++ b/devito/passes/iet/misc.py
@@ -5,11 +5,11 @@
 
 from devito.finite_differences import Max, Min
 from devito.ir import (Any, Forward, Iteration, List, Prodder, FindApplications,
-                       FindNodes, Transformer, Uxreplace, filter_iterations,
-                       retrieve_iteration_tree)
+                       FindNodes, FindSymbols, Transformer, Uxreplace,
+                       filter_iterations, retrieve_iteration_tree)
 from devito.passes.iet.engine import iet_pass
 from devito.symbolics import evalrel, has_integer_args
-from devito.tools import as_mapper, split
+from devito.tools import as_mapper, filter_ordered, split
 
 __all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions',
            'generate_macros', 'minimize_symbols']
@@ -178,23 +178,29 @@ def minimize_symbols(iet):
 
 
 def remove_redundant_moddims(iet):
-    subs0 = {}
-    subs1 = {}
-    for n in FindNodes(Iteration).visit(iet):
-        mds = [d for d in n.uindices
-               if d.is_Modulo and d.origin is not None]
-        if not mds:
-            continue
+    key = lambda d: d.is_Modulo and d.origin is not None
+    mds = [d for d in FindSymbols('dimensions').visit(iet) if key(d)]
+    if not mds:
+        return iet
 
-        mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo)
-        for k, v in mapper.items():
-            chosen = v.pop(0)
-            subs0.update({d: chosen for d in v})
+    mapper = as_mapper(mds, key=lambda md: md.origin % md.modulo)
 
-        uindices = [d for d in n.uindices if d not in subs0]
-        subs1[n] = n._rebuild(uindices=uindices)
+    subs = {}
+    for k, v in mapper.items():
+        chosen = v.pop(0)
+        subs.update({d: chosen for d in v})
+
+    body = Uxreplace(subs).visit(iet.body)
+    iet = iet._rebuild(body=body)
+
+    # ModuloDimensions are defined in Iteration headers, hence they must be
+    # removed from there too
+    subs = {}
+    for n in FindNodes(Iteration).visit(iet):
+        if not set(n.uindices) & set(mds):
+            continue
+        subs[n] = n._rebuild(uindices=filter_ordered(n.uindices))
 
-    iet = Transformer(subs1, nested=True).visit(iet)
-    iet = Uxreplace(subs0).visit(iet)
+    iet = Transformer(subs, nested=True).visit(iet)
 
     return iet
diff --git a/devito/symbolics/printer.py b/devito/symbolics/printer.py
index c47ef95bfc..af362af96a 100644
--- a/devito/symbolics/printer.py
+++ b/devito/symbolics/printer.py
@@ -225,10 +225,13 @@ def _print_TrigonometricFunction(self, expr):
             func_name += 'f'
         return '%s(%s)' % (func_name, self._print(*expr.args))
 
+    def _print_DefFunction(self, expr):
+        arguments = [self._print(i) for i in expr.arguments]
+        return "%s(%s)" % (expr.name, ','.join(arguments))
+
     def _print_Fallback(self, expr):
         return expr.__str__()
 
-    _print_DefFunction = _print_Fallback
     _print_MacroArgument = _print_Fallback
     _print_IndexedBase = _print_Fallback
     _print_IndexSum = _print_Fallback
diff --git a/devito/symbolics/queries.py b/devito/symbolics/queries.py
index ec86ae7809..c4002508cb 100644
--- a/devito/symbolics/queries.py
+++ b/devito/symbolics/queries.py
@@ -20,7 +20,8 @@
 # * Number
 # * Symbol
 # * Indexed
-extra_leaves = (FieldFromPointer, FieldFromComposite, IndexedBase, AbstractObject)
+extra_leaves = (FieldFromPointer, FieldFromComposite, IndexedBase, AbstractObject,
+                IndexedPointer)
 
 
 def q_symbol(expr):
@@ -31,7 +32,9 @@ def q_symbol(expr):
 
 
 def q_leaf(expr):
-    return expr.is_Atom or expr.is_Indexed or isinstance(expr, extra_leaves)
+    return (expr.is_Atom or
+            expr.is_Indexed or
+            isinstance(expr, extra_leaves))
 
 
 def q_indexed(expr):
@@ -51,7 +54,7 @@ def q_derivative(expr):
 def q_terminal(expr):
     return (expr.is_Symbol or
             expr.is_Indexed or
-            isinstance(expr, extra_leaves + (IndexedPointer,)))
+            isinstance(expr, extra_leaves))
 
 
 def q_routine(expr):
diff --git a/devito/symbolics/search.py b/devito/symbolics/search.py
index 94e533a692..7680bd462a 100644
--- a/devito/symbolics/search.py
+++ b/devito/symbolics/search.py
@@ -1,3 +1,5 @@
+import sympy
+
 from devito.symbolics.queries import (q_indexed, q_function, q_terminal, q_leaf,
                                       q_symbol, q_dimension, q_derivative)
 from devito.tools import as_tuple
@@ -48,7 +50,7 @@ def __init__(self, query, mode, deep=False):
         self.deep = deep
 
     def _next(self, expr):
-        if self.deep is True and expr.is_Indexed:
+        if self.deep and expr.is_Indexed:
             return expr.indices
         elif q_leaf(expr):
             return ()
@@ -110,10 +112,18 @@ def search(exprs, query, mode='unique', visit='dfs', deep=False):
 
     assert mode in Search.modes, "Unknown mode"
 
-    searcher = Search(query, mode, deep)
+    if isinstance(query, type):
+        Q = lambda obj: isinstance(obj, query)
+    else:
+        Q = query
+
+    searcher = Search(Q, mode, deep)
 
     found = Search.modes[mode]()
     for e in as_tuple(exprs):
+        if not isinstance(e, sympy.Basic):
+            continue
+
         if visit == 'dfs':
             found.update(searcher.dfs(e))
         elif visit == 'bfs':
diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py
index 10a1d90672..218e1c6c98 100644
--- a/devito/tools/data_structures.py
+++ b/devito/tools/data_structures.py
@@ -29,6 +29,13 @@ class Bunch(object):
     def __init__(self, **kwargs):
         self.__dict__.update(kwargs)
 
+    def __repr__(self):
+        return "Bunch(%s)" % ", ".join(["%s=%s" % i for i in self.__dict__.items()])
+
+    def __iter__(self):
+        for i in self.__dict__.values():
+            yield i
+
 
 class EnrichedTuple(tuple, Pickable):
 
@@ -319,11 +326,13 @@ class DAG(object):
         https://github.com/thieman/py-dag/
     """
 
-    def __init__(self, nodes=None, edges=None):
+    def __init__(self, nodes=None, edges=None, labels=None):
         self.graph = OrderedDict()
         self.labels = DefaultOrderedDict(dict)
+
         for node in as_tuple(nodes):
             self.add_node(node)
+
         for i in as_tuple(edges):
             try:
                 ind_node, dep_node = i
@@ -332,6 +341,11 @@ def __init__(self, nodes=None, edges=None):
                 self.labels[ind_node][dep_node] = label
             self.add_edge(ind_node, dep_node)
 
+        for ind_node, i in (labels or {}).items():
+            for dep_node, v in i.items():
+                if dep_node in self.graph.get(ind_node, []):
+                    self.labels[ind_node][dep_node] = v
+
     def __contains__(self, key):
         return key in self.graph
 
@@ -339,6 +353,10 @@ def __contains__(self, key):
     def nodes(self):
         return tuple(self.graph)
 
+    @property
+    def roots(self):
+        return tuple(n for n in self.nodes if not self.predecessors(n))
+
     @property
     def edges(self):
         ret = []
@@ -350,6 +368,9 @@ def edges(self):
     def size(self):
         return len(self.graph)
 
+    def clone(self):
+        return self.__class__(self.nodes, self.edges, self.labels)
+
     def add_node(self, node_name, ignore_existing=False):
         """Add a node if it does not exist yet, or error out."""
         if node_name in self.graph:
@@ -398,6 +419,24 @@ def predecessors(self, node):
         """Return a list of all predecessors of the given node."""
         return [key for key in self.graph if node in self.graph[key]]
 
+    def all_predecessors(self, node):
+        """
+        Return a list of all nodes ultimately predecessors of the given node
+        in the dependency graph, in topological order.
+        """
+        found = set()
+
+        def _all_predecessors(n):
+            if n in found:
+                return
+            found.add(n)
+            for predecessor in self.predecessors(n):
+                _all_predecessors(predecessor)
+
+        _all_predecessors(node)
+
+        return found
+
     def downstream(self, node):
         """Return a list of all nodes this node has edges towards."""
         if node not in self.graph:
@@ -493,6 +532,28 @@ def connected_components(self, enumerated=False):
         else:
             return tuple(groups)
 
+    def find_paths(self, node):
+        if node not in self.graph:
+            raise KeyError('node %s is not in graph' % node)
+
+        paths = []
+
+        def dfs(node, path):
+            path.append(node)
+
+            if not self.graph[node]:
+                paths.append(tuple(path))
+            else:
+                for child in self.graph[node]:
+                    dfs(child, path)
+
+            # Remove the node from the path to backtrack
+            path.pop()
+
+        dfs(node, [])
+
+        return tuple(paths)
+
 
 class frozendict(Mapping):
     """
@@ -575,7 +636,11 @@ def __init__(self, *items):
         nitems = []
         for i in as_tuple(items):
             if isinstance(i, Iterable):
-                nitems.append(tuple(i))
+                if isinstance(i, tuple):
+                    # Honours tuple subclasses
+                    nitems.append(i)
+                else:
+                    nitems.append(tuple(i))
             else:
                 raise ValueError("Expected sequence, got %s" % type(i))
 
@@ -589,6 +654,17 @@ def __repr__(self):
             items[self.tip] = "*%s" % items[self.tip]
         return "%s(%s)" % (self.__class__.__name__, ", ".join(items))
 
+    @property
+    def curitem(self):
+        return self.items[self.tip]
+
+    @property
+    def nextitem(self):
+        return self.items[min(self.tip + 1, max(len(self.items) - 1, 0))]
+
+    def index(self, item):
+        return self.items.index(item)
+
     def iter(self):
         if not self.items:
             raise ValueError("No tuples available")
diff --git a/devito/tools/dtypes_lowering.py b/devito/tools/dtypes_lowering.py
index 9793904ac2..7a4074b370 100644
--- a/devito/tools/dtypes_lowering.py
+++ b/devito/tools/dtypes_lowering.py
@@ -75,12 +75,15 @@ def add_dtype(self, field_name, count):
 
         self.update(build_dtypes_vector([field_name], [count]))
 
-    def get_base_dtype(self, v):
+    def get_base_dtype(self, v, default=None):
         for (base_dtype, count), dtype in self.items():
             if dtype is v:
                 return base_dtype
 
-        raise ValueError
+        if default is not None:
+            return default
+        else:
+            raise ValueError
 
 
 dtypes_vector_mapper = DTypesVectorMapper()
@@ -254,8 +257,10 @@ def infer_dtype(dtypes):
           highest precision;
         * If there's at least one floating dtype, ignore any integer dtypes.
     """
-    fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)}
+    # Resolve the vector types, if any
+    dtypes = {dtypes_vector_mapper.get_base_dtype(i, i) for i in dtypes}
 
+    fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)}
     if len(fdtypes) > 1:
         return max(fdtypes, key=lambda i: np.dtype(i).itemsize)
     elif len(fdtypes) == 1:
diff --git a/devito/types/array.py b/devito/types/array.py
index f12922f330..f7113c19b1 100644
--- a/devito/types/array.py
+++ b/devito/types/array.py
@@ -11,7 +11,8 @@
 from devito.types.basic import AbstractFunction
 from devito.types.utils import CtypesFactory, DimensionTuple
 
-__all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle']
+__all__ = ['Array', 'ArrayMapped', 'ArrayObject', 'PointerArray', 'Bundle',
+           'ComponentAccess']
 
 
 class ArrayBasic(AbstractFunction):
diff --git a/devito/types/basic.py b/devito/types/basic.py
index d7a422b39e..cc73bda6cf 100644
--- a/devito/types/basic.py
+++ b/devito/types/basic.py
@@ -1462,3 +1462,17 @@ def compare(self, other):
             if c:
                 return c
         return 0
+
+    def _subs(self, old, new, **hints):
+        # Wrap in a try to make sure no substitution happens when
+        # old is an Indexed as only checkink `old is new` would lead to
+        # incorrect substitution of `old.base` by `new`
+        try:
+            if old.is_Indexed:
+                if old.base == self.base and old.indices == self.indices:
+                    return new
+                else:
+                    return self
+        except AttributeError:
+            pass
+        return super()._subs(old, new, **hints)
diff --git a/devito/types/dimension.py b/devito/types/dimension.py
index 6626f09517..620c23e56a 100644
--- a/devito/types/dimension.py
+++ b/devito/types/dimension.py
@@ -1347,8 +1347,10 @@ class StencilDimension(BasicDimension):
     is_Stencil = True
 
     __rargs__ = BasicDimension.__rargs__ + ('_min', '_max')
+    __rkwargs__ = BasicDimension.__rkwargs__ + ('backward',)
 
-    def __init_finalize__(self, name, _min, _max, spacing=None, **kwargs):
+    def __init_finalize__(self, name, _min, _max, spacing=None, backward=False,
+                          **kwargs):
         self._spacing = sympy.sympify(spacing) or sympy.S.One
 
         if not is_integer(_min):
@@ -1360,13 +1362,21 @@ def __init_finalize__(self, name, _min, _max, spacing=None, **kwargs):
 
         self._min = _min
         self._max = _max
+
         self._size = _max - _min + 1
 
+        self._backward = backward
+
         if self._size < 1:
             raise ValueError("Expected size greater than 0 (got %s)" % self._size)
 
     def _hashable_content(self):
-        return super()._hashable_content() + (self._min, self._max, self._spacing)
+        return (super()._hashable_content() +
+                (self._min, self._max, self._spacing, self._backward))
+
+    @cached_property
+    def backward(self):
+        return self._backward
 
     @cached_property
     def symbolic_size(self):
@@ -1513,7 +1523,7 @@ class AffineIndexAccessFunction(IndexAccessFunction):
 
         * the "main" Dimension (in practice a SpaceDimension or a TimeDimension);
         * an offset (number or symbolic expression, of dtype integer).
-        * a StencilDimension;
+        * one or more StencilDimensions;
 
     Examples
     --------
@@ -1522,39 +1532,47 @@ class AffineIndexAccessFunction(IndexAccessFunction):
     """
 
     def __new__(cls, *args, **kwargs):
+        # `args` may contain arbitrarily complicated expressions, so first of all
+        # we let SymPy simplify it, then we process the args and see if the
+        # resulting expression is indeed an AffineIndexAccessFunction
+        add = sympy.Add(*args, **kwargs)
+        if not isinstance(add, sympy.Add):
+            # E.g., reduced to a Symbol
+            return add
+
         d = 0
-        sd = 0
+        sds = []
         ofs_items = []
-        for a in args:
+        for a in add.args:
             if isinstance(a, StencilDimension):
-                if sd != 0:
-                    return sympy.Add(*args, **kwargs)
-                sd = a
+                sds.append(a)
             elif isinstance(a, Dimension):
                 d = cls._separate_dims(d, a, ofs_items)
                 if d is None:
-                    return sympy.Add(*args, **kwargs)
+                    return add
             elif isinstance(a, AffineIndexAccessFunction):
-                if sd != 0 and a.sd != 0:
-                    return sympy.Add(*args, **kwargs)
+                if sds and a.sds:
+                    return add
                 d = cls._separate_dims(d, a.d, ofs_items)
                 if d is None:
-                    return sympy.Add(*args, **kwargs)
-                sd = a.sd
+                    return add
+                sds = list(a.sds or sds)
                 ofs_items.append(a.ofs)
             else:
                 ofs_items.append(a)
 
         ofs = sympy.Add(*[i for i in ofs_items if i is not None])
         if not all(is_integer(i) or i.is_Symbol for i in ofs.free_symbols):
-            return sympy.Add(*args, **kwargs)
+            return add
+
+        sds = tuple(sds)
 
-        obj = IndexAccessFunction.__new__(cls, d, ofs, sd)
+        obj = IndexAccessFunction.__new__(cls, d, ofs, *sds)
 
         if isinstance(obj, AffineIndexAccessFunction):
             obj.d = d
             obj.ofs = ofs
-            obj.sd = sd
+            obj.sds = sds
         else:
             # E.g., SymPy simplified it to Zero or something else
             pass
@@ -1564,7 +1582,7 @@ def __new__(cls, *args, **kwargs):
     @classmethod
     def _separate_dims(cls, d0, d1, ofs_items):
         if d0 == 0 and d1 == 0:
-            return None
+            return 0
         elif d0 == 0 and isinstance(d1, Dimension):
             return d1
         elif d1 == 0 and isinstance(d0, Dimension):
diff --git a/devito/types/misc.py b/devito/types/misc.py
index c644a60614..ccbd9b7fdc 100644
--- a/devito/types/misc.py
+++ b/devito/types/misc.py
@@ -3,12 +3,12 @@
 import numpy as np
 from sympy.core.core import ordering_of_classes
 
-from devito.types import CompositeObject, Indexed, Symbol
+from devito.types import Array, CompositeObject, Indexed, Symbol
 from devito.types.basic import IndexedData
 from devito.tools import Pickable, as_tuple
 
 __all__ = ['Timer', 'Pointer', 'VolatileInt', 'FIndexed', 'Wildcard',
-           'Global', 'Hyperplane', 'Indirection', 'Temp', 'Jump']
+           'Global', 'Hyperplane', 'Indirection', 'Temp', 'TempArray', 'Jump']
 
 
 class Timer(CompositeObject):
@@ -175,8 +175,8 @@ def __new__(cls, name=None, mapped=None, dtype=np.uint64, is_const=True,
 class Temp(Symbol):
 
     """
-    A Temp is a Symbol used by compiler passes to store locally-constructed
-    temporary expressions.
+    A Temp is a Symbol used by compiler passes to store intermediate
+    sub-expressions.
     """
 
     # Just make sure the SymPy args ordering is the same regardless of whether
@@ -184,6 +184,16 @@ class Temp(Symbol):
     ordering_of_classes.insert(ordering_of_classes.index('Symbol') + 1, 'Temp')
 
 
+class TempArray(Array):
+
+    """
+    A TempArray is an Array used by compiler passes to store intermediate
+    sub-expressions.
+    """
+
+    pass
+
+
 class Jump(object):
 
     """
diff --git a/devito/types/object.py b/devito/types/object.py
index d5d20d65d9..1db973cce9 100644
--- a/devito/types/object.py
+++ b/devito/types/object.py
@@ -53,6 +53,8 @@ def __repr__(self):
     def _sympystr(self, printer):
         return str(self)
 
+    _ccode = _sympystr
+
     def _hashable_content(self):
         return (self.name, self.dtype)
 
diff --git a/examples/performance/01_gpu.ipynb b/examples/performance/01_gpu.ipynb
index a3f01a55a4..1b4e29ddb9 100644
--- a/examples/performance/01_gpu.ipynb
+++ b/examples/performance/01_gpu.ipynb
@@ -253,7 +253,7 @@
      "output_type": "stream",
      "text": [
       "#define _POSIX_C_SOURCE 200809L\n",
-      "#define uL0(time, x, y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n",
+      "#define uL0(time,x,y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n",
       "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n",
       "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n",
       "\n",
diff --git a/tests/test_derivatives.py b/tests/test_derivatives.py
index 123975d63d..8031932df5 100644
--- a/tests/test_derivatives.py
+++ b/tests/test_derivatives.py
@@ -457,6 +457,16 @@ def test_all_shortcuts(self, so):
         for fd in g._fd:
             assert getattr(g, fd)
 
+    def test_transpose_simple(self):
+        grid = Grid(shape=(4, 4))
+
+        f = TimeFunction(name='f', grid=grid, space_order=4)
+
+        assert str(f.dx.T.evaluate) == ("-0.0833333333*f(t, x - 2*h_x, y)/h_x "
+                                        "+ 0.666666667*f(t, x - h_x, y)/h_x "
+                                        "- 0.666666667*f(t, x + h_x, y)/h_x "
+                                        "+ 0.0833333333*f(t, x + 2*h_x, y)/h_x")
+
     @pytest.mark.parametrize('so', [2, 4, 8, 12])
     @pytest.mark.parametrize('ndim', [1, 2])
     @pytest.mark.parametrize('derivative, adjoint_name', [
@@ -743,6 +753,19 @@ def test_dxdy_v2(self):
         term1 = f.dxdy._evaluate(expand=False)
         assert len(term1.find(IndexDerivative)) == 2
 
+    def test_transpose(self):
+        grid = Grid(shape=(4, 4))
+        x, _ = grid.dimensions
+        h_x = x.spacing
+
+        f = TimeFunction(name='f', grid=grid, space_order=4)
+
+        term = f.dx.T._evaluate(expand=False)
+        assert isinstance(term, IndexDerivative)
+
+        i0, = term.dimensions
+        assert term.base == f.subs(x, x + i0*h_x)
+
 
 def bypass_uneval(expr):
     unevals = expr.find(EvalDerivative)
diff --git a/tests/test_dimension.py b/tests/test_dimension.py
index 7bf28faa35..701995e443 100644
--- a/tests/test_dimension.py
+++ b/tests/test_dimension.py
@@ -28,7 +28,7 @@ def test_basic(self):
         assert isinstance(expr, AffineIndexAccessFunction)
         assert expr.d is d
         assert expr.ofs == 1
-        assert expr.sd == 0
+        assert expr.sds == ()
 
         s0 = Symbol(name='s0', dtype=np.int32)
         s1 = Symbol(name='s1', dtype=np.int32)
@@ -38,7 +38,7 @@ def test_basic(self):
         assert isinstance(expr, AffineIndexAccessFunction)
         assert expr.d is d
         assert expr.ofs == s0 + s1 + 1
-        assert expr.sd == 0
+        assert expr.sds == ()
 
     def test_reversed(self):
         d = Dimension(name='x')
@@ -48,14 +48,14 @@ def test_reversed(self):
         assert isinstance(expr, AffineIndexAccessFunction)
         assert expr.d is d
         assert expr.ofs == 1
-        assert expr.sd == 0
+        assert expr.sds == ()
 
         expr = d.symbolic_max + d
 
         assert isinstance(expr, AffineIndexAccessFunction)
         assert expr.d is d
         assert expr.ofs is d.symbolic_max
-        assert expr.sd == 0
+        assert expr.sds == ()
 
     def test_non_affine(self):
         grid = Grid(shape=(3,))
@@ -76,8 +76,8 @@ def test_stencil_dim(self):
 
         assert isinstance(expr, AffineIndexAccessFunction)
         assert expr.d is d
-        assert expr.sd is sd
         assert expr.ofs == 1
+        assert expr.sds == (sd,)
 
         s = Symbol(name='s')
 
@@ -85,8 +85,36 @@ def test_stencil_dim(self):
 
         assert isinstance(expr, AffineIndexAccessFunction)
         assert expr.d is d
-        assert expr.sd is sd
         assert expr.ofs == 1 + s
+        assert expr.sds == (sd,)
+
+        expr = sd + 1 + d
+
+        assert isinstance(expr, AffineIndexAccessFunction)
+        assert expr.d is d
+        assert expr.ofs == 1
+        assert expr.sds == (sd,)
+
+    def test_stencil_dim_multiple(self):
+        d = Dimension(name='x')
+        sd0 = StencilDimension('i0', 0, 1)
+        sd1 = StencilDimension('i1', 0, 1)
+
+        expr = d + sd0 + sd1 + 1
+
+        assert isinstance(expr, AffineIndexAccessFunction)
+        assert expr.d is d
+        assert expr.ofs == 1
+        assert expr.sds == (sd0, sd1)
+
+        s = Symbol(name='s')
+
+        expr = sd0 + d + sd1 + 1 + s
+
+        assert isinstance(expr, AffineIndexAccessFunction)
+        assert expr.d is d
+        assert expr.ofs == 1 + s
+        assert expr.sds == (sd0, sd1)
 
     def test_sub(self):
         d = Dimension(name='x')
@@ -104,8 +132,15 @@ def test_sub(self):
 
         assert isinstance(expr, AffineIndexAccessFunction)
         assert expr.d is d
-        assert expr.sd is sd
         assert expr.ofs == -1 - s
+        assert expr.sds == (sd,)
+
+        expr = d + 1 + sd - d
+
+        assert isinstance(expr, AffineIndexAccessFunction)
+        assert expr.d == 0
+        assert expr.ofs == 1
+        assert expr.sds == (sd,)
 
 
 class TestBufferedDimension(object):
diff --git a/tests/test_dle.py b/tests/test_dle.py
index cbe19aee7e..7a6bc3496c 100644
--- a/tests/test_dle.py
+++ b/tests/test_dle.py
@@ -294,9 +294,9 @@ class TestBlockingParTile(object):
         ((16, 16, 16), ((16, 16, 16), (16, 16, 16))),
         ((32, 4, 4), ((4, 4, 32), (4, 4, 32))),
         (((16, 4, 4), (16, 16, 16)), ((4, 4, 16), (16, 16, 16))),
-        (((32, 4, 4), 1), ((4, 4, 32), (4, 4, 32))),
-        (((32, 4, 4), 1, 'tag0'), ((4, 4, 32), (4, 4, 32))),
-        ((((32, 4, 8), 1, 'tag0'), ((32, 8, 4), 2)), ((8, 4, 32), (4, 8, 32))),
+        (((32, 4, 4), None), ((4, 4, 32), (4, 4, 32))),
+        (((32, 4, 4), None, 'tag0'), ((4, 4, 32), (4, 4, 32))),
+        ((((32, 4, 8), None, 'tag0'), ((32, 8, 4), None)), ((8, 4, 32), (4, 8, 32))),
     ])
     def test_structure(self, par_tile, expected):
         grid = Grid(shape=(8, 8, 8))
@@ -344,6 +344,62 @@ def test_structure_2p5D(self):
             assert iters[0].step == par_tile[1]
             assert iters[1].step == par_tile[0]
 
+    def test_custom_rule0(self):
+        grid = Grid(shape=(8, 8, 8))
+
+        u = TimeFunction(name="u", grid=grid, space_order=4)
+        v = TimeFunction(name="v", grid=grid, space_order=4)
+
+        eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx),
+                Eq(v.forward, u.forward.dx)]
+
+        # "Apply par-tile=(4, 4, 4) to the loop nest (kernel) with id (rule)=1,
+        # and use default for the rest!"
+        par_tile = (4, 4, 4)
+        rule = 1
+
+        op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule),
+                                                  'blockinner': True}))
+
+        # Check generated code. By having specified "1" as rule, we expect the
+        # given par-tile to be applied to the kernel with id 1
+        bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'z2_blk0'})
+        root = bns['x1_blk0']
+        iters = FindNodes(Iteration).visit(root)
+        iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1]
+        assert len(iters) == 3
+        assert all(i.step == j for i, j in zip(iters, par_tile))
+
+    def test_custom_rule1(self):
+        grid = Grid(shape=(8, 8, 8))
+        x, y, z = grid.dimensions
+
+        f = Function(name='f', grid=grid)
+        u = TimeFunction(name="u", grid=grid, space_order=4)
+        v = TimeFunction(name="v", grid=grid, space_order=4)
+
+        eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx + cos(f)*cos(f[x+1, y, z])),
+                Eq(v.forward, u.forward.dx)]
+
+        # "Apply par-tile=(4, 4, 4) to the loop nests (kernels) embedded within
+        # the time loop, and use default for the rest!"
+        par_tile = (4, 4, 4)
+        rule = grid.time_dim.name  # We must be able to infer it from str
+
+        op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule),
+                                                  'blockinner': True,
+                                                  'blockrelax': True}))
+
+        # Check generated code. By having specified "1" as rule, we expect the
+        # given par-tile to be applied to the kernel with id 1
+        bns, _ = assert_blocking(op, {'z0_blk0', 'x1_blk0', 'x2_blk0', 'x3_blk0'})
+        for i in ['x1_blk0', 'x2_blk0', 'x3_blk0']:
+            root = bns[i]
+            iters = FindNodes(Iteration).visit(root)
+            iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1]
+            assert len(iters) == 3
+            assert all(i.step == j for i, j in zip(iters, par_tile))
+
 
 @pytest.mark.parametrize("shape", [(10,), (10, 45), (20, 33), (10, 31, 45), (45, 31, 45)])
 @pytest.mark.parametrize("time_order", [2])
@@ -1028,6 +1084,7 @@ def test_multiple_subnests_v0(self):
                                _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3.*f) + 1.)
         op = Operator(eqn, opt=('advanced', {'openmp': True,
                                              'cire-mingain': 0,
+                                             'cire-schedule': 1,
                                              'par-nested': 0,
                                              'par-collapse-ncores': 1,
                                              'par-dynamic-work': 0}))
@@ -1063,6 +1120,7 @@ def test_multiple_subnests_v1(self):
                                _R(u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3.*f) + 1.)
         op = Operator(eqn, opt=('advanced', {'openmp': True,
                                              'cire-mingain': 0,
+                                             'cire-schedule': 1,
                                              'cire-rotate': True,
                                              'par-nested': 0,
                                              'par-collapse-ncores': 1,
diff --git a/tests/test_dse.py b/tests/test_dse.py
index b346e00092..9473068cbc 100644
--- a/tests/test_dse.py
+++ b/tests/test_dse.py
@@ -2,21 +2,27 @@
 import pytest
 from cached_property import cached_property
 
+from sympy import Mul  # noqa
+
 from conftest import (skipif, EVAL, _R, assert_structure, assert_blocking,  # noqa
                       get_params, get_arrays, check_array)
-from devito import (NODE, Eq, Inc, Constant, Function, TimeFunction, SparseTimeFunction,  # noqa
-                    Dimension, SubDimension, ConditionalDimension, DefaultDimension, Grid,
-                    Operator, norm, grad, div, dimensions, switchconfig, configuration,
-                    centered, first_derivative, solve, transpose, Abs, cos, sin, sqrt)
+from devito import (NODE, Eq, Inc, Constant, Function, TimeFunction,  # noqa
+                    SparseTimeFunction, Dimension, SubDimension,
+                    ConditionalDimension, DefaultDimension, Grid, Operator,
+                    norm, grad, div, dimensions, switchconfig, configuration,
+                    centered, first_derivative, solve, transpose, Abs, cos,
+                    sin, sqrt)
 from devito.exceptions import InvalidArgument, InvalidOperator
 from devito.finite_differences.differentiable import diffify
 from devito.ir import (Conditional, DummyEq, Expression, Iteration, FindNodes,
                        FindSymbols, ParallelIteration, retrieve_iteration_tree)
 from devito.passes.clusters.aliases import collect
+from devito.passes.clusters.factorization import collect_nested
 from devito.passes.clusters.cse import Temp, _cse
 from devito.passes.iet.parpragma import VExpanded
 from devito.symbolics import (INT, FLOAT, DefFunction, FieldFromPointer,  # noqa
-                              Keyword, SizeOf, estimate_cost, pow_to_mul, indexify)
+                              IndexedPointer, Keyword, SizeOf, estimate_cost,
+                              pow_to_mul, indexify)
 from devito.tools import as_tuple, generator
 from devito.types import Array, Scalar, Symbol
 
@@ -161,6 +167,9 @@ def test_cse(exprs, expected, min_cost):
     ('fa[x]**(-s)', 'fa[x]**(-s)'),
     ('-2/(s**2)', '-2/(s*s)'),
     ('-fa[x]', '-fa[x]'),
+    ('Mul(SizeOf("char"), '
+     '-IndexedPointer(FieldFromPointer("size", fa._C_symbol), x), evaluate=False)',
+     'sizeof(char)*(-fa_vec->size[x])'),
 ])
 def test_pow_to_mul(expr, expected):
     grid = Grid((4, 5))
@@ -173,6 +182,19 @@ def test_pow_to_mul(expr, expected):
     assert str(pow_to_mul(eval(expr))) == expected
 
 
+@pytest.mark.parametrize('expr,expected', [
+    ('s - SizeOf("int")*fa[x]', 's - fa[x]*sizeof(int)'),
+])
+def test_factorize(expr, expected):
+    grid = Grid((4, 5))
+    x, y = grid.dimensions
+
+    s = Scalar(name='s')  # noqa
+    fa = Function(name='fa', grid=grid, dimensions=(x,), shape=(4,))  # noqa
+
+    assert str(collect_nested(eval(expr))) == expected
+
+
 @pytest.mark.parametrize('expr,expected,estimate', [
     ('Eq(t0, 3)', 0, False),
     ('Eq(t0, 4.5)', 0, False),
@@ -505,6 +527,7 @@ def test_collection(self, exprs, expected):
         ispace = exprs[0].ispace
 
         aliases = collect(extracted, ispace, False)
+        aliases.filter(lambda a: a.score > 0)
 
         assert len(aliases) == len(expected)
         assert all(i.pivot in expected for i in aliases)
@@ -2211,7 +2234,8 @@ def test_derivatives_from_different_levels(self):
 
         eqn = Eq(v.forward, f*(1. + v).dx + 2.*f*((1. + v).dx + f))
 
-        op = Operator(eqn, opt=('advanced', {'cire-mingain': 0}))
+        op = Operator(eqn, opt=('advanced', {'cire-mingain': 0,
+                                             'cire-schedule': 0}))
 
         # Check code generation
         assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == 1
diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py
index a93d280fc7..30e4c45639 100644
--- a/tests/test_gpu_common.py
+++ b/tests/test_gpu_common.py
@@ -845,8 +845,8 @@ def test_tasking_over_compiler_generated(self):
             assert len(retrieve_iteration_tree(op)) == 5
             assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1
             sections = FindNodes(Section).visit(op)
-            assert len(sections) == 3
-            assert 'while(lock0[t1] == 0)' in str(sections[1].body[0].body[0].body[0])
+            assert len(sections) == 4
+            assert 'while(lock0[t1] == 0)' in str(sections[2].body[0].body[0].body[0])
 
         op0.apply(time_M=nt-1)
         op1.apply(time_M=nt-1, u=u1, usave=usave1)
diff --git a/tests/test_linearize.py b/tests/test_linearize.py
index 45347bf605..efed9f79db 100644
--- a/tests/test_linearize.py
+++ b/tests/test_linearize.py
@@ -479,8 +479,8 @@ def test_issue_1838():
     op1.apply(time_M=3, dt=1., p0=p1)
 
     # Check generated code
-    assert "r0L0(x, y, z) r0[(x)*y_stride1 + (y)*z_stride1 + (z)]" in str(op1)
-    assert "r4L0(x, y, z) r4[(x)*y_stride2 + (y)*z_stride1 + (z)]" in str(op1)
+    assert "r0L0(x,y,z) r0[(x)*y_stride1 + (y)*z_stride1 + (z)]" in str(op1)
+    assert "r4L0(x,y,z) r4[(x)*y_stride2 + (y)*z_stride1 + (z)]" in str(op1)
 
     assert np.allclose(p0.data, p1.data, rtol=1e-6)
 
diff --git a/tests/test_mpi.py b/tests/test_mpi.py
index 8639a435e4..bf49ba8014 100644
--- a/tests/test_mpi.py
+++ b/tests/test_mpi.py
@@ -2079,7 +2079,8 @@ def test_cire(self):
         eqn = Eq(u.forward, _R(_R(u[t, x, y] + u[t, x+1, y+1])*3.*f +
                                _R(u[t, x+2, y+2] + u[t, x+3, y+3])*3.*f) + 1.)
         op0 = Operator(eqn, opt='noop')
-        op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0}))
+        op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0,
+                                              'cire-schedule': 1}))
 
         assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 1
 
@@ -2112,7 +2113,8 @@ def test_cire_with_shifted_diagonal_halo_touch(self):
         eqn = Eq(u.forward, _R(_R(u[t, x, y] + u[t, x+2, y])*3.*f +
                                _R(u[t, x+1, y+1] + u[t, x+3, y+1])*3.*f) + 1.)
         op0 = Operator(eqn, opt='noop')
-        op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0}))
+        op1 = Operator(eqn, opt=('advanced', {'cire-mingain': 0,
+                                              'cire-schedule': 1}))
 
         assert len([i for i in FindSymbols().visit(op1.body) if i.is_Array]) == 1
 
diff --git a/tests/test_symbolics.py b/tests/test_symbolics.py
index 27d40c93c2..b957a058c4 100644
--- a/tests/test_symbolics.py
+++ b/tests/test_symbolics.py
@@ -292,6 +292,18 @@ class BarCast(Cast):
     assert v != v1
 
 
+def test_findexed():
+    grid = Grid(shape=(3, 3, 3))
+    f = Function(name='f', grid=grid)
+
+    fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2))
+    new_fi = fi.func(strides=(3, 4))
+
+    assert new_fi.name == fi.name == 'f'
+    assert new_fi.indices == fi.indices
+    assert new_fi.strides == (3, 4)
+
+
 def test_symbolic_printing():
     b = Symbol('b')
 
@@ -304,17 +316,11 @@ class MyLocalObject(LocalObject, Expr):
     lo = MyLocalObject(name='lo')
     assert str(lo + 2) == '2 + lo'
 
-
-def test_findexed():
-    grid = Grid(shape=(3, 3, 3))
-    f = Function(name='f', grid=grid)
-
+    grid = Grid((10,))
+    f = Function(name="f", grid=grid)
     fi = FIndexed.from_indexed(f.indexify(), "foo", strides=(1, 2))
-    new_fi = fi.func(strides=(3, 4))
-
-    assert new_fi.name == fi.name == 'f'
-    assert new_fi.indices == fi.indices
-    assert new_fi.strides == (3, 4)
+    df = DefFunction('aaa', arguments=[fi])
+    assert ccode(df) == 'aaa(foo(x))'
 
 
 def test_is_on_grid():
diff --git a/tests/test_unexpansion.py b/tests/test_unexpansion.py
index d9d01b6fc9..7d0391a85a 100644
--- a/tests/test_unexpansion.py
+++ b/tests/test_unexpansion.py
@@ -5,7 +5,23 @@
 from devito.types import Symbol
 
 
-class TestBasic(object):
+class TestLoopScheduling(object):
+
+    def test_backward_dt2(self):
+        grid = Grid(shape=(4, 4))
+
+        f = Function(name='f', grid=grid)
+        v = TimeFunction(name='v', grid=grid, time_order=2)
+
+        eqns = [Eq(v.backward, v + 1.),
+                Eq(f, v.dt2)]
+
+        op = Operator(eqns, opt=('advanced', {'openmp': True,
+                                              'expand': False}))
+        assert_structure(op, ['t,x,y'], 't,x,y')
+
+
+class Test1Pass(object):
 
     def test_v0(self):
         grid = Grid(shape=(10, 10, 10))
@@ -85,7 +101,8 @@ def test_v2(self):
 
         op0 = Operator(eqns)
         op1 = Operator(eqns, opt=('advanced', {'expand': False,
-                                               'blocklevels': 0}))
+                                               'blocklevels': 0,
+                                               'cire-mingain': 200}))
 
         # Check generated code -- expect maximal fusion!
         assert_structure(op1,
@@ -110,7 +127,8 @@ def test_v3(self):
                 Eq(v.forward, (v + u.dx.dy + 1.))]
 
         op0 = Operator(eqns)
-        op1 = Operator(eqns, opt=('advanced', {'expand': False}))
+        op1 = Operator(eqns, opt=('advanced', {'expand': False,
+                                               'cire-mingain': 200}))
 
         # Check generated code -- redundant IndexDerivatives have been caught!
         op1._profiler._sections['section0'].sops == 65
@@ -123,63 +141,12 @@ def test_v3(self):
 
     def test_v4(self):
         grid = Grid(shape=(16, 16, 16))
-        t = grid.stepping_dim
-        x, y, z = grid.dimensions
-
-        so = 4
-
-        a = Function(name='a', grid=grid, space_order=so)
-        f = Function(name='f', grid=grid, space_order=so)
-        e = Function(name='e', grid=grid, space_order=so)
-        r = Function(name='r', grid=grid, space_order=so)
-        p0 = TimeFunction(name='p0', grid=grid, time_order=2, space_order=so)
-        m0 = TimeFunction(name='m0', grid=grid, time_order=2, space_order=so)
-
-        def g1(field, r, e):
-            return (cos(e) * cos(r) * field.dx(x0=x+x.spacing/2) +
-                    cos(e) * sin(r) * field.dy(x0=y+y.spacing/2) -
-                    sin(e) * field.dz(x0=z+z.spacing/2))
-
-        def g2(field, r, e):
-            return - (sin(r) * field.dx(x0=x+x.spacing/2) -
-                      cos(r) * field.dy(x0=y+y.spacing/2))
-
-        def g3(field, r, e):
-            return (sin(e) * cos(r) * field.dx(x0=x+x.spacing/2) +
-                    sin(e) * sin(r) * field.dy(x0=y+y.spacing/2) +
-                    cos(e) * field.dz(x0=z+z.spacing/2))
-
-        def g1_tilde(field, r, e):
-            return ((cos(e) * cos(r) * field).dx(x0=x-x.spacing/2) +
-                    (cos(e) * sin(r) * field).dy(x0=y-y.spacing/2) -
-                    (sin(e) * field).dz(x0=z-z.spacing/2))
-
-        def g2_tilde(field, r, e):
-            return - ((sin(r) * field).dx(x0=x-x.spacing/2) -
-                      (cos(r) * field).dy(x0=y-y.spacing/2))
-
-        def g3_tilde(field, r, e):
-            return ((sin(e) * cos(r) * field).dx(x0=x-x.spacing/2) +
-                    (sin(e) * sin(r) * field).dy(x0=y-y.spacing/2) +
-                    (cos(e) * field).dz(x0=z-z.spacing/2))
-
-        update_p = t.spacing**2 * a**2 / f * \
-            (g1_tilde(f * g1(p0, r, e), r, e) +
-             g2_tilde(f * g2(p0, r, e), r, e) +
-             g3_tilde(f * g3(p0, r, e) + f * g3(m0, r, e), r, e)) + \
-            (2 - t.spacing * a)
-
-        update_m = t.spacing**2 * a**2 / f * \
-            (g1_tilde(f * g1(m0, r, e), r, e) +
-             g2_tilde(f * g2(m0, r, e), r, e) +
-             g3_tilde(f * g3(m0, r, e) + f * g3(p0, r, e), r, e)) + \
-            (2 - t.spacing * a)
-
-        eqns = [Eq(p0.forward, update_p),
-                Eq(m0.forward, update_m)]
+
+        eqns = tti_sa_eqns(grid)
 
         op = Operator(eqns, subs=grid.spacing_map,
-                      opt=('advanced', {'expand': False}))
+                      opt=('advanced', {'expand': False,
+                                        'cire-mingain': 400}))
 
         # Check code generation
         assert op._profiler._sections['section1'].sops == 1442
@@ -203,7 +170,8 @@ def test_v5(self):
                 Eq(m0.forward, m0.dx.dx + m0.backward)]
 
         op = Operator(eqns, subs=grid.spacing_map,
-                      opt=('advanced', {'expand': False}))
+                      opt=('advanced', {'expand': False,
+                                        'cire-mingain': 200}))
 
         # Check code generation
         assert op._profiler._sections['section0'].sops == 127
@@ -227,10 +195,145 @@ def test_v6(self):
                 Eq(m0.forward, (m0.dx + s0).dx + f*m0.backward)]
 
         op = Operator(eqns, subs=grid.spacing_map,
-                      opt=('advanced', {'expand': False}))
+                      opt=('advanced', {'expand': False,
+                                        'cire-mingain': 200}))
 
         # Check code generation
         assert op._profiler._sections['section0'].sops == 133
         assert_structure(op, ['t,x,y', 't,x,y,i1', 't,x,y,i1,i0'], 't,x,y,i1,i0')
 
         op.cfunction
+
+    def test_transpose(self):
+        shape = (10, 10, 10)
+        grid = Grid(shape=shape)
+        x, _, _ = grid.dimensions
+
+        u = TimeFunction(name='u', grid=grid, space_order=4)
+        u1 = TimeFunction(name='u', grid=grid, space_order=4)
+
+        # Chessboard-like init
+        u.data[:] = np.indices(shape).sum(axis=0) % 10 + 1
+        u1.data[:] = np.indices(shape).sum(axis=0) % 10 + 1
+
+        eqn = Eq(u.forward, u.dx(x0=x+x.spacing/2).T + 1.)
+
+        op0 = Operator(eqn)
+        op1 = Operator(eqn, opt=('advanced', {'expand': False}))
+
+        op0.apply(time_M=10)
+        op1.apply(time_M=10, u=u1)
+
+        assert np.allclose(u.data, u1.data, rtol=10e-6)
+
+
+class Test2Pass(object):
+
+    def test_v0(self):
+        grid = Grid(shape=(10, 10, 10))
+
+        u = TimeFunction(name='u', grid=grid, space_order=4)
+        v = TimeFunction(name='v', grid=grid, space_order=4)
+        u1 = TimeFunction(name='u', grid=grid, space_order=4)
+        v1 = TimeFunction(name='v', grid=grid, space_order=4)
+
+        eqns = [Eq(u.forward, (u.dx.dy + v*u + 1.)),
+                Eq(v.forward, (v + u.dx.dy + 1.))]
+
+        op0 = Operator(eqns)
+        op1 = Operator(eqns, opt=('advanced', {'expand': False,
+                                               'openmp': True}))
+
+        # Check generated code
+        op1._profiler._sections['section0'].sops == 65
+        assert_structure(op1, ['t',
+                               't,x0_blk0,y0_blk0,x,y,z',
+                               't,x0_blk0,y0_blk0,x,y,z,i0',
+                               't,x0_blk0,y0_blk0,x,y,z',
+                               't,x0_blk0,y0_blk0,x,y,z,i1'],
+                         't,x0_blk0,y0_blk0,x,y,z,i0,y,z,i1')
+
+        op0.apply(time_M=5)
+        op1.apply(time_M=5, u=u1, v=v1)
+
+        assert np.allclose(u.data, u1.data, rtol=10e-3)
+        assert np.allclose(v.data, v1.data, rtol=10e-3)
+
+    def test_v1(self):
+        grid = Grid(shape=(16, 16, 16))
+
+        eqns = tti_sa_eqns(grid)
+
+        op = Operator(eqns, subs=grid.spacing_map,
+                      opt=('advanced', {'expand': False,
+                                        'openmp': False}))
+
+        # Check code generation
+        assert op._profiler._sections['section1'].sops == 190
+        assert_structure(op, ['x,y,z',
+                              't,x0_blk0,y0_blk0,x,y,z',
+                              't,x0_blk0,y0_blk0,x,y,z,i0',
+                              't,x0_blk0,y0_blk0,x,y,z',
+                              't,x0_blk0,y0_blk0,x,y,z,i1'],
+                         'x,y,z,t,x0_blk0,y0_blk0,x,y,z,i0,x,y,z,i1')
+
+        op.cfunction
+
+
+def tti_sa_eqns(grid):
+    t = grid.stepping_dim
+    x, y, z = grid.dimensions
+
+    so = 4
+
+    a = Function(name='a', grid=grid, space_order=so)
+    f = Function(name='f', grid=grid, space_order=so)
+    e = Function(name='e', grid=grid, space_order=so)
+    r = Function(name='r', grid=grid, space_order=so)
+    p0 = TimeFunction(name='p0', grid=grid, time_order=2, space_order=so)
+    m0 = TimeFunction(name='m0', grid=grid, time_order=2, space_order=so)
+
+    def g1(field, r, e):
+        return (cos(e) * cos(r) * field.dx(x0=x+x.spacing/2) +
+                cos(e) * sin(r) * field.dy(x0=y+y.spacing/2) -
+                sin(e) * field.dz(x0=z+z.spacing/2))
+
+    def g2(field, r, e):
+        return - (sin(r) * field.dx(x0=x+x.spacing/2) -
+                  cos(r) * field.dy(x0=y+y.spacing/2))
+
+    def g3(field, r, e):
+        return (sin(e) * cos(r) * field.dx(x0=x+x.spacing/2) +
+                sin(e) * sin(r) * field.dy(x0=y+y.spacing/2) +
+                cos(e) * field.dz(x0=z+z.spacing/2))
+
+    def g1_tilde(field, r, e):
+        return ((cos(e) * cos(r) * field).dx(x0=x-x.spacing/2) +
+                (cos(e) * sin(r) * field).dy(x0=y-y.spacing/2) -
+                (sin(e) * field).dz(x0=z-z.spacing/2))
+
+    def g2_tilde(field, r, e):
+        return - ((sin(r) * field).dx(x0=x-x.spacing/2) -
+                  (cos(r) * field).dy(x0=y-y.spacing/2))
+
+    def g3_tilde(field, r, e):
+        return ((sin(e) * cos(r) * field).dx(x0=x-x.spacing/2) +
+                (sin(e) * sin(r) * field).dy(x0=y-y.spacing/2) +
+                (cos(e) * field).dz(x0=z-z.spacing/2))
+
+    update_p = t.spacing**2 * a**2 / f * \
+        (g1_tilde(f * g1(p0, r, e), r, e) +
+         g2_tilde(f * g2(p0, r, e), r, e) +
+         g3_tilde(f * g3(p0, r, e) + f * g3(m0, r, e), r, e)) + \
+        (2 - t.spacing * a)
+
+    update_m = t.spacing**2 * a**2 / f * \
+        (g1_tilde(f * g1(m0, r, e), r, e) +
+         g2_tilde(f * g2(m0, r, e), r, e) +
+         g3_tilde(f * g3(m0, r, e) + f * g3(p0, r, e), r, e)) + \
+        (2 - t.spacing * a)
+
+    eqns = [Eq(p0.forward, update_p),
+            Eq(m0.forward, update_m)]
+
+    return eqns