diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py
index 2acccba648..91e68fc02b 100644
--- a/devito/passes/iet/langbase.py
+++ b/devito/passes/iet/langbase.py
@@ -214,9 +214,6 @@ def DeviceIteration(self):
     def Prodder(self):
         return self.lang.Prodder
 
-    def _device_pointers(self, *args, **kwargs):
-        return {}
-
 
 class DeviceAwareMixin(object):
 
diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py
index b6476192b2..29ba9a986b 100644
--- a/devito/passes/iet/parpragma.py
+++ b/devito/passes/iet/parpragma.py
@@ -1,3 +1,5 @@
+from itertools import takewhile
+
 import numpy as np
 import cgen as c
 from cached_property import cached_property
@@ -254,6 +256,46 @@ def nthreads_nonaffine(self):
     def threadid(self):
         return self.sregistry.threadid
 
+    def _score_candidate(self, n0, root, collapsable=()):
+        """
+        The score of a collapsable nest depends on the number of fully-parallel
+        Iterations and their position in the nest (the outer, the better).
+        """
+        nest = [root] + list(collapsable)
+        n = len(nest)
+
+        # Number of fully-parallel collapsable Iterations
+        key = lambda i: i.is_ParallelNoAtomic
+        fp_iters = list(takewhile(key, nest))
+        n_fp_iters = len(fp_iters)
+
+        # Number of parallel-if-atomic collapsable Iterations
+        key = lambda i: i.is_ParallelAtomic
+        pia_iters = list(takewhile(key, nest))
+        n_pia_iters = len(pia_iters)
+
+        # Prioritize the Dimensions that are more likely to define larger
+        # iteration spaces
+        key = lambda d: (not d.is_Derived or
+                         (d.is_Custom and not is_integer(d.symbolic_size)) or
+                         (d.is_Block and d._depth == 1))
+
+        fpdims = [i.dim for i in fp_iters]
+        n_fp_iters_large = len([d for d in fpdims if key(d)])
+
+        piadims = [i.dim for i in pia_iters]
+        n_pia_iters_large = len([d for d in piadims if key(d)])
+
+        return (
+            int(n_fp_iters == n),  # Fully-parallel nest
+            n_fp_iters_large,
+            n_fp_iters,
+            n_pia_iters_large,
+            n_pia_iters,
+            -(n0 + 1),  # The outer, the better
+            n,
+        )
+
     def _select_candidates(self, candidates):
         assert candidates
 
@@ -263,15 +305,18 @@ def _select_candidates(self, candidates):
         mapper = {}
         for n0, root in enumerate(candidates):
 
+            # Score `root` in isolation
+            mapper[(root, ())] = self._score_candidate(n0, root)
+
             collapsable = []
             for n, i in enumerate(candidates[n0+1:], n0+1):
                 # The Iteration nest [root, ..., i] must be perfect
                 if not IsPerfectIteration(depth=i).visit(root):
                     break
 
-                # Loops are collapsable only if none of the iteration variables appear
-                # in initializer expressions. For example, the following two loops
-                # cannot be collapsed
+                # Loops are collapsable only if none of the iteration variables
+                # appear in initializer expressions. For example, the following
+                # two loops cannot be collapsed
                 #
                 # for (i = ... )
                 #   for (j = i ...)
@@ -281,7 +326,7 @@ def _select_candidates(self, candidates):
                 if any(j.dim in i.symbolic_min.free_symbols for j in candidates[n0:n]):
                     break
 
-                # Also, we do not want to collapse SIMD-vectorized Iterations
+                # Can't collapse SIMD-vectorized Iterations
                 if i.is_Vectorized:
                     break
 
@@ -297,17 +342,9 @@ def _select_candidates(self, candidates):
 
                 collapsable.append(i)
 
-            # Give a score to this candidate, based on the number of fully-parallel
-            # Iterations and their position (i.e. outermost to innermost) in the nest
-            score = (
-                int(root.is_ParallelNoAtomic),
-                len(self._device_pointers(root)),  # Outermost offloadable
-                int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1),
-                int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1),
-                -(n0 + 1)  # The outermost, the better
-            )
-
-            mapper[(root, tuple(collapsable))] = score
+                # Score `root + collapsable`
+                v = tuple(collapsable)
+                mapper[(root, v)] = self._score_candidate(n0, root, v)
 
         # Retrieve the candidates with highest score
         root, collapsable = max(mapper, key=mapper.get)
@@ -576,6 +613,13 @@ def __init__(self, sregistry, options, platform, compiler):
         self.par_tile = UnboundTuple(options['par-tile'])
         self.par_disabled = options['par-disabled']
 
+    def _score_candidate(self, n0, root, collapsable=()):
+        # `ndptrs`, the number of device pointers, part of the score too to
+        # ensure the outermost loop is offloaded
+        ndptrs = len(self._device_pointers(root))
+
+        return (ndptrs,) + super()._score_candidate(n0, root, collapsable)
+
     def _make_threaded_prodders(self, partree):
         if isinstance(partree.root, self.DeviceIteration):
             # no-op for now
diff --git a/tests/test_adjoint.py b/tests/test_adjoint.py
index 473e484c0e..759b86a3ae 100644
--- a/tests/test_adjoint.py
+++ b/tests/test_adjoint.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from devito import Operator, norm, Function, Grid, SparseFunction
+from devito import Operator, norm, Function, Grid, SparseFunction, inner
 from devito.logger import info
 from examples.seismic import demo_model, Receiver
 from examples.seismic.acoustic import acoustic_setup
@@ -114,7 +114,7 @@ def test_adjoint_F(self, mkey, shape, kernel, space_order, time_order, setup_fun
         solver.adjoint(rec=rec, srca=srca)
 
         # Adjoint test: Verify <Ax,y> matches  <x, A^Ty> closely
-        term1 = np.dot(srca.data.reshape(-1), solver.geometry.src.data)
+        term1 = inner(srca, solver.geometry.src)
         term2 = norm(rec) ** 2
         info('<x, A^Ty>: %f, <Ax,y>: %f, difference: %4.4e, ratio: %f'
              % (term1, term2, (term1 - term2)/term1, term1 / term2))
@@ -231,6 +231,6 @@ def test_adjoint_inject_interpolate(self, shape, coords, npoints=19):
         # y => p
         # x => c
         # P^T y => a
-        term1 = np.dot(p2.data.reshape(-1), p.data.reshape(-1))
-        term2 = np.dot(c.data.reshape(-1), a.data.reshape(-1))
+        term1 = inner(p2, p)
+        term2 = inner(c, a)
         assert np.isclose((term1-term2) / term1, 0., atol=1.e-6)
diff --git a/tests/test_dle.py b/tests/test_dle.py
index ec24128983..03c0b533c9 100644
--- a/tests/test_dle.py
+++ b/tests/test_dle.py
@@ -284,7 +284,7 @@ def test_cache_blocking_structure_optrelax_prec_inject():
                                           'openmp': True,
                                           'par-collapse-ncores': 1}))
 
-    assert_structure(op, ['t,p_s0_blk0,p_s', 't,p_s0_blk0,p_s,rsx,rsy'],
+    assert_structure(op, ['t', 't,p_s0_blk0,p_s,rsx,rsy'],
                      't,p_s0_blk0,p_s,rsx,rsy')
 
 
@@ -863,9 +863,9 @@ def test_incs_no_atomic(self):
         op0 = Operator(Inc(uf, 1), opt=('advanced', {'openmp': True,
                                                      'par-collapse-ncores': 1,
                                                      'par-collapse-work': 0}))
-
-        assert 'collapse(3)' in str(op0)
-        assert 'atomic' in str(op0)
+        assert 'omp for schedule' in str(op0)
+        assert 'collapse' not in str(op0)
+        assert 'atomic' not in str(op0)
 
         # Now only `x` is parallelized
         op1 = Operator([Eq(v[t, x, 0, 0], v[t, x, 0, 0] + 1), Inc(uf, 1)],
@@ -875,6 +875,46 @@ def test_incs_no_atomic(self):
         assert 'collapse' not in str(op1)
         assert 'atomic' not in str(op1)
 
+    def test_incr_perfect_outer(self):
+        grid = Grid((5, 5))
+        d = Dimension(name="d")
+
+        u = Function(name="u", dimensions=(*grid.dimensions, d),
+                     grid=grid, shape=(*grid.shape, 5), )
+        v = Function(name="v", dimensions=(*grid.dimensions, d),
+                     grid=grid, shape=(*grid.shape, 5))
+        w = Function(name="w", grid=grid)
+
+        u.data.fill(1)
+        v.data.fill(2)
+
+        summation = Inc(w, u*v)
+
+        op = Operator([summation], opt=('advanced', {'openmp': True}))
+        assert 'reduction' not in str(op)
+        assert 'omp for' in str(op)
+
+        op()
+        assert np.all(w.data == 10)
+
+    def test_incr_perfect_sparse_outer(self):
+        grid = Grid(shape=(3, 3, 3))
+
+        u = TimeFunction(name='u', grid=grid)
+        s = SparseTimeFunction(name='u', grid=grid, npoint=1, nt=11)
+
+        eqns = s.inject(u, expr=s)
+
+        op = Operator(eqns, opt=('advanced', {'par-collapse-ncores': 0,
+                                              'openmp': True}))
+
+        iters = FindNodes(Iteration).visit(op)
+        assert len(iters) == 5
+        assert iters[0].is_Sequential
+        assert all(i.is_ParallelAtomic for i in iters[1:])
+        assert iters[1].pragmas[0].value == 'omp for schedule(dynamic,chunk_size)'
+        assert all(not i.pragmas for i in iters[2:])
+
     @pytest.mark.parametrize('exprs,simd_level,expected', [
         (['Eq(y.symbolic_max, g[0, x], implicit_dims=(t, x))',
          'Inc(h1[0, 0], 1, implicit_dims=(t, x, y))'],
diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py
index 031bd9181b..071005464d 100644
--- a/tests/test_gpu_common.py
+++ b/tests/test_gpu_common.py
@@ -6,9 +6,9 @@
 
 from conftest import assert_structure
 from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension,
-                    MatrixSparseTimeFunction, SparseTimeFunction, SubDimension,
-                    SubDomain, SubDomainSet, TimeFunction, Operator, configuration,
-                    switchconfig)
+                    Dimension, MatrixSparseTimeFunction, SparseTimeFunction,
+                    SubDimension, SubDomain, SubDomainSet, TimeFunction,
+                    Operator, configuration, switchconfig)
 from devito.arch import get_gpu_info
 from devito.exceptions import InvalidArgument
 from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols,
@@ -110,6 +110,30 @@ def test_fission(self):
         assert np.all(usave.data[5:] == expected[5:])
         assert np.all(vsave.data[:5] == expected[:5])
 
+    def test_incr_perfect_outer(self):
+        grid = Grid((5, 5))
+        d = Dimension(name="d")
+
+        u = Function(name="u", dimensions=(*grid.dimensions, d),
+                     grid=grid, shape=(*grid.shape, 5), )
+        v = Function(name="v", dimensions=(*grid.dimensions, d),
+                     grid=grid, shape=(*grid.shape, 5))
+        w = Function(name="w", grid=grid)
+
+        u.data.fill(1)
+        v.data.fill(2)
+
+        summation = Inc(w, u*v)
+
+        op = Operator([summation])
+
+        assert 'reduction' not in str(op)
+        assert 'collapse(2)' in str(op)
+        assert 'parallel' in str(op)
+
+        op()
+        assert np.all(w.data == 10)
+
 
 class Bundle(SubDomain):
     """