diff --git a/Makefile b/Makefile
index f032a10b..760985fe 100644
--- a/Makefile
+++ b/Makefile
@@ -221,8 +221,8 @@ api-tests: compiler-api
 
 all-tests: compiler-api
 	$(MAKE) tuple-test
-	$(MAKE) combo-api-tests
 	$(YK_MAKE) $@
+	$(MAKE) combo-api-tests
 
 all:
 	$(MAKE) realclean
diff --git a/docs/YASK-intro.pdf b/docs/YASK-intro.pdf
old mode 100755
new mode 100644
index 67b28457..584c1026
Binary files a/docs/YASK-intro.pdf and b/docs/YASK-intro.pdf differ
diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp
index a8876243..d05a50bb 100644
--- a/src/common/common_utils.cpp
+++ b/src/common/common_utils.cpp
@@ -41,7 +41,7 @@ namespace yask {
     // for numbers above 9 (at least up to 99).
 
     // Format: "major.minor.patch".
-    const string version = "2.10.00";
+    const string version = "2.10.02";
 
     string yask_get_version_string() {
         return version;
diff --git a/src/kernel/Makefile b/src/kernel/Makefile
index 42803aba..f08766b8 100644
--- a/src/kernel/Makefile
+++ b/src/kernel/Makefile
@@ -762,7 +762,7 @@ py-api-no-yc:
 
 # Validation runs for each binary.
 val1	:=	-dt 2 -b 16 -d 48
-val2	:=	-dt 2 -b 24 -r 32 -rt 2 -dx 63 -dy 49 -dz 47
+val2	:=	-dt 2 -b 24 -r 32 -rt 2 -d 63
 ranks	:=	2
 
 # Run the kernel binary using several combos of sizes and ranks.
diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp
index 5e35d4ea..69ec35d0 100644
--- a/src/kernel/lib/context.cpp
+++ b/src/kernel/lib/context.cpp
@@ -564,32 +564,34 @@ namespace yask {
                 // time-step, the parallelogram may be trimmed based on the
                 // BB and WF extensions outside of the rank-BB.
 
-                // Actual region boundaries must stay within [extended] rank BB.
+                // Actual region boundaries must stay within [extended] pack BB.
                 // We have to calculate the posn in the extended rank at each
                 // value of 'shift_num' because it is being shifted spatially.
                 bool ok = true;
-                for (int i = 0; i < ndims; i++) {
+                auto& pbb = bp->getBB();
+                for (int i = 0, j = 0; i < ndims; i++) {
                     if (i == step_posn) continue;
-                    auto& dname = _dims->_stencil_dims.getDimName(i);
-                    auto angle = wf_angles[dname];
+                    auto angle = wf_angles[j];
 
                     // Begin point.
-                    idx_t dbegin = rank_bb.bb_begin[dname];
-                    idx_t rbegin = max<idx_t>(start[i], ext_bb.bb_begin[dname]);
+                    idx_t dbegin = rank_bb.bb_begin[j]; // non-extended domain.
+                    idx_t rbegin = max<idx_t>(start[i], pbb.bb_begin[j]);
                     if (rbegin < dbegin) // in left WF ext?
-                        rbegin = max(rbegin, dbegin - left_wf_exts[dname] + shift_num * angle);
+                        rbegin = max(rbegin, dbegin - left_wf_exts[j] + shift_num * angle);
                     region_idxs.begin[i] = rbegin;
 
                     // End point.
-                    idx_t dend = rank_bb.bb_end[dname];
-                    idx_t rend = min<idx_t>(stop[i], ext_bb.bb_end[dname]);
+                    idx_t dend = rank_bb.bb_end[j]; // non-extended domain.
+                    idx_t rend = min<idx_t>(stop[i], pbb.bb_end[j]);
                     if (rend > dend) // in right WF ext?
-                        rend = min(rend, dend + right_wf_exts[dname] - shift_num * angle);
+                        rend = min(rend, dend + right_wf_exts[j] - shift_num * angle);
                     region_idxs.end[i] = rend;
 
                     // Anything to do?
                     if (rend <= rbegin)
                         ok = false;
+
+                    j++; // next domain index.
                 }
                 TRACE_MSG("calc_region: region span after trimming: " <<
                           region_idxs.begin.makeValStr(ndims) <<
@@ -624,13 +626,13 @@ namespace yask {
                 // left, so region loops must strictly increment. They may do
                 // so in any order.  TODO: shift only what is needed by
                 // this pack, not the global max.
-                for (int i = 0; i < ndims; i++) {
+                for (int i = 0, j = 0; i < ndims; i++) {
                     if (i == step_posn) continue;
-                    auto& dname = _dims->_stencil_dims.getDimName(i);
-                    auto angle = wf_angles[dname];
+                    auto angle = wf_angles[j];
 
                     start[i] -= angle;
                     stop[i] -= angle;
+                    j++;
                 }
                 shift_num++;
 
diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp
index 649e9292..4324f369 100644
--- a/src/kernel/lib/context.hpp
+++ b/src/kernel/lib/context.hpp
@@ -173,7 +173,7 @@ namespace yask {
         BoundingBox rank_bb;
 
         // BB with any needed extensions for wave-fronts.
-        // If WFs are not used, this is the same as rank_bb;
+        // If WFs are not used, this is the same as 'rank_bb';
         BoundingBox ext_bb;
 
         // List of all non-scratch stencil bundles in the order in which
diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp
index aea4664a..5aa37489 100644
--- a/src/kernel/lib/grid_apis.cpp
+++ b/src/kernel/lib/grid_apis.cpp
@@ -158,7 +158,7 @@ namespace yask {
 
         // Determine required padding from halos.
         Indices left_pads2 = getReqdPad(_left_halos, _left_wf_exts);
-        Indices right_pads2 = getReqdPad(_right_halos, _left_wf_exts);
+        Indices right_pads2 = getReqdPad(_right_halos, _right_wf_exts);
 
         // NB: requirements to successful share_storage() is not as strict as
         // is_storage_layout_identical(). See note on pad & halo below and API docs.
diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp
index 291dd071..540cbcb0 100644
--- a/src/kernel/lib/setup.cpp
+++ b/src/kernel/lib/setup.cpp
@@ -111,14 +111,16 @@ namespace yask {
             // Myself.
             if (rn == me) {
                 if (mandist != 0)
-                    FORMAT_AND_THROW_YASK_EXCEPTION("Internal error: distance to own rank == " << mandist);
+                    FORMAT_AND_THROW_YASK_EXCEPTION
+                        ("Internal error: distance to own rank == " << mandist);
             }
 
             // Someone else.
             else {
                 if (mandist == 0)
-                    FORMAT_AND_THROW_YASK_EXCEPTION("Error: ranks " << me <<
-                                                    " and " << rn << " at same coordinates");
+                    FORMAT_AND_THROW_YASK_EXCEPTION
+                        ("Error: ranks " << me <<
+                         " and " << rn << " at same coordinates");
             }
 
             // Loop through domain dims.
@@ -157,13 +159,14 @@ namespace yask {
                             auto rnsz = rsizes[rn][dj];
                             if (mysz != rnsz) {
                                 auto& dnamej = _opts->_rank_indices.getDimName(dj);
-                                FORMAT_AND_THROW_YASK_EXCEPTION("Error: rank " << rn << " and " << me <<
-                                                                " are both at rank-index " << coords[me][di] <<
-                                                                " in the '" << dname <<
-                                                                "' dimension , but their rank-domain sizes are " <<
-                                                                rnsz << " and " << mysz <<
-                                                                " (resp.) in the '" << dj <<
-                                                                "' dimension, making them unaligned");
+                                FORMAT_AND_THROW_YASK_EXCEPTION
+                                    ("Error: rank " << rn << " and " << me <<
+                                     " are both at rank-index " << coords[me][di] <<
+                                     " in the '" << dname <<
+                                     "' dimension , but their rank-domain sizes are " <<
+                                     rnsz << " and " << mysz <<
+                                     " (resp.) in the '" << dnamej << // name of dim 'dj', not its index.
+                                     "' dimension, making them unaligned");
                             }
                         }
                     }
@@ -316,7 +319,7 @@ namespace yask {
                     // Determine padded size (also offset to next location).
                     size_t nbytes = gp->get_num_storage_bytes();
                     npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad,
-                                                  CACHELINE_BYTES);
+                                                   CACHELINE_BYTES);
                     ngrids[numa_pref]++;
                     if (pass == 0)
                         TRACE_MSG(" grid '" << gname << "' needs " << makeByteStr(nbytes) <<
@@ -880,6 +883,11 @@ namespace yask {
     {
         assert(_opts);
 
+        // If we haven't finished constructing the context, it's too early
+        // to do this.
+        if (!stPacks.size())
+            return;
+
         // Reset halos to zero.
         max_halos = _dims->_domain_dims;
 
@@ -921,15 +929,19 @@ namespace yask {
         // of angles and extensions.
         auto& step_dim = _dims->_step_dim;
         auto wf_steps = _opts->_region_sizes[step_dim];
+        assert(wf_steps >= 1);
         num_wf_shifts = 0;
         if (wf_steps > 1) {
 
             // Need to shift for each bundle pack.
-            num_wf_shifts = stPacks.size() * wf_steps;
+            assert(stPacks.size() > 0);
+            num_wf_shifts = idx_t(stPacks.size()) * wf_steps;
+            assert(num_wf_shifts > 1);
 
             // Don't need to shift first one.
             num_wf_shifts--;
         }
+        assert(num_wf_shifts >= 0);
         for (auto& dim : _dims->_domain_dims.getDims()) {
             auto& dname = dim.getName();
             auto rksize = _opts->_rank_sizes[dname];
@@ -945,10 +957,12 @@ namespace yask {
             if (_opts->_region_sizes[dname] < rksize || nranks > 0)
                 angle = ROUND_UP(max_halos[dname], _dims->_fold_pts[dname]);
             wf_angles[dname] = angle;
+            assert(angle >= 0);
 
             // Determine the total WF shift to be added in each dim.
             idx_t shifts = angle * num_wf_shifts;
             wf_shifts[dname] = shifts;
+            assert(shifts >= 0);
 
             // Is domain size at least as large as halo + wf_ext in direction
             // when there are multiple ranks?
@@ -1117,7 +1131,7 @@ namespace yask {
             " rank-domain-offsets:   " << rank_domain_offsets.makeDimValOffsetStr() << endl <<
 #endif
             " rank-domain:           " << rank_bb.bb_begin.makeDimValStr() <<
-                " ... " << rank_bb.bb_end.subElements(1).makeDimValStr() << endl <<
+            " ... " << rank_bb.bb_end.subElements(1).makeDimValStr() << endl <<
             " vector-len:            " << VLEN << endl <<
             " extra-padding:         " << _opts->_extra_pad_sizes.makeDimValStr() << endl <<
             " minimum-padding:       " << _opts->_min_pad_sizes.makeDimValStr() << endl <<
@@ -1137,9 +1151,9 @@ namespace yask {
         os << endl;
 
         // Info about eqs, packs and bundles.
-        os << "Num stencil equations: " << NUM_STENCIL_EQS << endl;
-        os << "Num stencil bundles: " << stBundles.size() << endl;
-        os << "Num stencil packs: " << stPacks.size() << endl;
+        os << "Num stencil packs:      " << stPacks.size() << endl;
+        os << "Num stencil bundles:    " << stBundles.size() << endl;
+        os << "Num stencil equations:  " << NUM_STENCIL_EQS << endl;
 
 #if NUM_STENCIL_EQS
 
@@ -1149,9 +1163,13 @@ namespace yask {
         rank_numFpOps_1t = 0;
 
         for (auto& sp : stPacks) {
-            os << "Bundle(s) in pack '" << sp->get_name() << "':\n";
-            for (auto* sg : *sp) {
+            auto& pbb = sp->getBB();
+            os << "Pack '" << sp->get_name() << "':\n" <<
+                " num bundles:                 " << sp->size() << endl <<
+                " sub-domain scope:            " << pbb.bb_begin.makeDimValStr() <<
+                " ... " << pbb.bb_end.subElements(1).makeDimValStr() << endl;
 
+            for (auto* sg : *sp) {
                 idx_t updates1 = 0, reads1 = 0, fpops1 = 0;
 
                 // Loop through all the needed bundles to
@@ -1325,10 +1343,23 @@ namespace yask {
         ext_bb.bb_end = rank_bb.bb_end.addElements(right_wf_exts);
         ext_bb.update_bb("extended-rank", *this, true);
 
-        // Find BB for each bundle. Each will be a subset within
-        // 'ext_bb'.
-        for (auto sg : stBundles)
-            sg->find_bounding_box();
+        // Find BB for each pack.
+        for (auto sp : stPacks) {
+            auto& spbb = sp->getBB();
+            spbb.bb_begin = _dims->_domain_dims;
+            spbb.bb_end = _dims->_domain_dims;
+
+            // Find BB for each bundle in this pack.
+            for (auto sb : *sp) {
+                sb->find_bounding_box();
+
+                // Expand pack BB to encompass bundle BB.
+                auto& sbbb = sb->getBB();
+                spbb.bb_begin = spbb.bb_begin.minElements(sbbb.bb_begin);
+                spbb.bb_end = spbb.bb_end.maxElements(sbbb.bb_end);
+            }
+            spbb.update_bb(sp->get_name(), *this, false);
+        }
     }
 
     // Find the bounding-boxes for this bundle in this rank.
@@ -1352,9 +1383,8 @@ namespace yask {
         Indices max_pts(idx_min, nsdims);
         idx_t npts = 0;
 
-        // Begin, end tuples.
-        // Scan across domain in this rank including
-        // any extensions for wave-fronts.
+        // Begin, end tuples. Use 'ext_bb' to scan across domain in this
+        // rank including any extensions for wave-fronts.
         IdxTuple begin(stencil_dims);
         begin.setVals(context.ext_bb.bb_begin, false);
         begin[step_dim] = 0;
@@ -1369,11 +1399,11 @@ namespace yask {
 
         // Define misc-loop function.  Since step is always 1, we ignore
         // misc_stop.  Update only if point is in domain for this bundle.
-#define misc_fn(misc_idxs) do {                                  \
-            if (is_in_valid_domain(misc_idxs.start)) {           \
-                min_pts = min_pts.minElements(misc_idxs.start);  \
-                max_pts = max_pts.maxElements(misc_idxs.start);  \
-                npts++;                                          \
+#define misc_fn(misc_idxs) do {                                 \
+            if (is_in_valid_domain(misc_idxs.start)) {          \
+                min_pts = min_pts.minElements(misc_idxs.start); \
+                max_pts = max_pts.maxElements(misc_idxs.start); \
+                npts++;                                         \
             } } while(0)
         
         // Define OMP reductions to be used in generated code.
@@ -1631,11 +1661,11 @@ namespace yask {
         bb_is_full = true;
         if (bb_num_points != bb_size) {
             if (os)
-            *os << "Note: '" << name << "' domain has only " <<
-                makeNumStr(bb_num_points) <<
-                " valid point(s) inside its bounding-box of " <<
-                makeNumStr(bb_size) <<
-                " point(s); multiple sub-boxes will be used.\n";
+                *os << "Note: '" << name << "' domain has only " <<
+                    makeNumStr(bb_num_points) <<
+                    " valid point(s) inside its bounding-box of " <<
+                    makeNumStr(bb_size) <<
+                    " point(s); multiple sub-boxes will be used.\n";
             bb_is_full = false;
         }
 
@@ -1646,9 +1676,9 @@ namespace yask {
             if ((bb_begin[dname] - context.rank_domain_offsets[dname]) %
                 dims->_fold_pts[dname] != 0) {
                 if (os)
-                *os << "Note: '" << name << "' domain"
-                    " has one or more starting edges not on vector boundaries;"
-                    " masked calculations will be used in peel and remainder sub-blocks.\n";
+                    *os << "Note: '" << name << "' domain"
+                        " has one or more starting edges not on vector boundaries;"
+                        " masked calculations will be used in peel and remainder sub-blocks.\n";
                 bb_is_aligned = false;
                 break;
             }
@@ -1661,9 +1691,9 @@ namespace yask {
             if (bb_len[dname] % dims->_cluster_pts[dname] != 0) {
                 if (bb_is_full && bb_is_aligned)
                     if (os && bb_is_aligned)
-                    *os << "Note: '" << name << "' domain"
-                        " has one or more sizes that are not vector-cluster multiples;"
-                        " masked calculations will be used in peel and remainder sub-blocks.\n";
+                        *os << "Note: '" << name << "' domain"
+                            " has one or more sizes that are not vector-cluster multiples;"
+                            " masked calculations will be used in peel and remainder sub-blocks.\n";
                 bb_is_cluster_mult = false;
                 break;
             }
diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp
index 328d966e..d7aa47c7 100644
--- a/src/kernel/lib/stencil_calc.cpp
+++ b/src/kernel/lib/stencil_calc.cpp
@@ -62,20 +62,16 @@ namespace yask {
 
             // Trim the default block indices based on the bounding box(es)
             // for this bundle.
-            // TODO: replace string-based lookup w/indices.
             ScanIndices bb_idxs(def_block_idxs);
-            for (int i = 0; i < nsdims; i++) {
+            for (int i = 0, j = 0; i < nsdims; i++) {
                 if (i == step_posn) continue;
-                auto& dname = dims->_stencil_dims.getDimName(i);
 
                 // Begin point.
-                assert(bb.bb_begin.lookup(dname));
-                auto bbegin = max(def_block_idxs.begin[i], bb.bb_begin[dname]);
+                auto bbegin = max(def_block_idxs.begin[i], bb.bb_begin[j]);
                 bb_idxs.begin[i] = bbegin;
 
                 // End point.
-                assert(bb.bb_end.lookup(dname));
-                auto bend = min(def_block_idxs.end[i], bb.bb_end[dname]);
+                auto bend = min(def_block_idxs.end[i], bb.bb_end[j]);
                 bb_idxs.end[i] = bend;
 		
                 // Anything to do?
@@ -83,6 +79,7 @@ namespace yask {
                     bb_ok = false;
                     break;
                 }
+                j++;            // next domain index.
             }
 
             // nothing to do?
@@ -92,42 +89,42 @@ namespace yask {
                 continue; // to next BB.
             }
             
-        TRACE_MSG3("calc_block for bundle '" << get_name() <<
-                   "': after trimming for BB " << bbn << ": " <<
-                   bb_idxs.begin.makeValStr(nsdims) <<
-                   " ... (end before) " << bb_idxs.end.makeValStr(nsdims));
-
-        // Update offsets of scratch grids based on this bundle's location.
-        _generic_context->update_scratch_grid_info(thread_idx, bb_idxs.begin);
-
-        // Get the bundles that need to be processed in
-        // this block. This will be any prerequisite scratch-grid
-        // bundles plus this non-scratch bundle.
-        auto sg_list = get_reqd_bundles();
-
-        // Set number of threads for a block.
-        // Each of these threads will work on a sub-block.
-        // This should be nested within a top-level OpenMP task.
-        _generic_context->set_block_threads();
-
-        // Loop through all the needed bundles.
-        for (auto* sg : sg_list) {
-
-            // Indices needed for the generated loops.  Will normally be a
-            // copy of 'bb_idxs' except when updating scratch-grids.
-            ScanIndices block_idxs = sg->adjust_span(thread_idx, bb_idxs);
-
-            TRACE_MSG3("calc_block for bundle '" << get_name() << "': " <<
-                       " in reqd bundle '" << sg->get_name() << "': " <<
-                       block_idxs.begin.makeValStr(nsdims) <<
-                       " ... (end before) " << block_idxs.end.makeValStr(nsdims) <<
-                       " by thread " << thread_idx);
-
-            // Include automatically-generated loop code that calls
-            // calc_sub_block() for each sub-block in this block. This
-            // code typically contains the nested OpenMP loop(s).
+            TRACE_MSG3("calc_block for bundle '" << get_name() <<
+                       "': after trimming for BB " << bbn << ": " <<
+                       bb_idxs.begin.makeValStr(nsdims) <<
+                       " ... (end before) " << bb_idxs.end.makeValStr(nsdims));
+
+            // Update offsets of scratch grids based on this bundle's location.
+            _generic_context->update_scratch_grid_info(thread_idx, bb_idxs.begin);
+
+            // Get the bundles that need to be processed in
+            // this block. This will be any prerequisite scratch-grid
+            // bundles plus this non-scratch bundle.
+            auto sg_list = get_reqd_bundles();
+
+            // Set number of threads for a block.
+            // Each of these threads will work on a sub-block.
+            // This should be nested within a top-level OpenMP task.
+            _generic_context->set_block_threads();
+
+            // Loop through all the needed bundles.
+            for (auto* sg : sg_list) {
+
+                // Indices needed for the generated loops.  Will normally be a
+                // copy of 'bb_idxs' except when updating scratch-grids.
+                ScanIndices block_idxs = sg->adjust_span(thread_idx, bb_idxs);
+
+                TRACE_MSG3("calc_block for bundle '" << get_name() << "': " <<
+                           " in reqd bundle '" << sg->get_name() << "': " <<
+                           block_idxs.begin.makeValStr(nsdims) <<
+                           " ... (end before) " << block_idxs.end.makeValStr(nsdims) <<
+                           " by thread " << thread_idx);
+
+                // Include automatically-generated loop code that calls
+                // calc_sub_block() for each sub-block in this block. This
+                // code typically contains the nested OpenMP loop(s).
 #include "yask_block_loops.hpp"
-        }
+            }
         } // BB list.
     }
 
@@ -182,10 +179,10 @@ namespace yask {
         /*
           Indices in each domain dim:
 
-            sub_block_eidxs.begin                      rem_masks used here
-            |peel_masks used here                      | sub_block_eidxs.end
-            ||                                         | |
-            vv                                         v v
+          sub_block_eidxs.begin                      rem_masks used here
+          |peel_masks used here                      | sub_block_eidxs.end
+          ||                                         | |
+          vv                                         v v
           |---|-------|---------------------------|---|---|  <- "|" on vec boundaries.
           ^   ^       ^                            ^   ^   ^
           |   |       |                            |   |   |
@@ -251,158 +248,158 @@ namespace yask {
         // Determine the subset of this sub-block that is
         // clusters, vectors, and partial vectors.
 #else
-            do_clusters = true;
-            do_vectors = false;
-            do_scalars = false;
+        do_clusters = true;
+        do_vectors = false;
+        do_scalars = false;
 
-            // i: index for stencil dims, j: index for domain dims.
-            for (int i = 0, j = 0; i < nsdims; i++) {
-                if (i != step_posn) {
-
-                    // Rank offset.
-                    auto rofs = cp->rank_domain_offsets[j];
-
-                    // Begin/end of rank-relative scalar elements in this dim.
-                    auto ebgn = sub_block_idxs.begin[i] - rofs;
-                    auto eend = sub_block_idxs.end[i] - rofs;
-                    sub_block_eidxs.begin[i] = ebgn;
-                    sub_block_eidxs.end[i] = eend;
-
-                    // Find range of full clusters.
-                    // Note that fcend <= eend because we round
-                    // down to get whole clusters only.
-                    // Similarly, fcbgn >= ebgn.
-                    auto cpts = dims->_cluster_pts[j];
-                    auto fcbgn = round_up_flr(ebgn, cpts);
-                    auto fcend = round_down_flr(eend, cpts);
-                    sub_block_fcidxs.begin[i] = fcbgn;
-                    sub_block_fcidxs.end[i] = fcend;
-
-                    // Any clusters to do?
-                    if (fcend <= fcbgn)
-                        do_clusters = false;
-
-                    // If anything before or after clusters, continue with
-                    // setting vector indices and peel/rem masks.
-                    if (fcbgn > ebgn || fcend < eend) {
-
-                        // Find range of full and/or partial vectors.
-                        // Note that fvend <= eend because we round
-                        // down to get whole vectors only.
-                        // Note that vend >= eend because we round
-                        // up to include partial vectors.
-                        // Similar but opposite for begin vars.
-                        // We make a vector mask to pick the
-                        // right elements.
-                        // TODO: use compile-time consts instead
-                        // of _fold_pts for more efficiency.
-                        auto vpts = dims->_fold_pts[j];
-                        auto fvbgn = round_up_flr(ebgn, vpts);
-                        auto fvend = round_down_flr(eend, vpts);
-                        auto vbgn = round_down_flr(ebgn, vpts);
-                        auto vend = round_up_flr(eend, vpts);
-                        if (i == _inner_posn) {
-
-                            // Don't do any full and/or partial vectors in
-                            // plane of inner dim.  We'll do these with
-                            // scalars.  This is unusual because vector
-                            // folding is normally done in a plane
-                            // perpendicular to the inner dim for >= 2D
-                            // domains.
-                            fvbgn = vbgn = fcbgn;
-                            fvend = vend = fcend;
-                        }
-                        sub_block_fvidxs.begin[i] = fvbgn;
-                        sub_block_fvidxs.end[i] = fvend;
-                        sub_block_vidxs.begin[i] = vbgn;
-                        sub_block_vidxs.end[i] = vend;
-
-                        // Any vectors to do (full and/or partial)?
-                        if (vbgn < fcbgn || vend > fcend)
-                            do_vectors = true;
-
-                        // Calculate masks in this dim for partial vectors.
-                        // All such masks will be ANDed together to form the
-                        // final masks over all domain dims.
-                        // Example: assume folding is x=4*y=4.
-                        // Possible 'x' peel mask to exclude 1st 2 cols:
-                        //   0 0 1 1
-                        //   0 0 1 1
-                        //   0 0 1 1
-                        //   0 0 1 1
-                        // Possible 'y' peel mask to exclude 1st row:
-                        //   0 0 0 0
-                        //   1 1 1 1
-                        //   1 1 1 1
-                        //   1 1 1 1
-                        // Along 'x' face, the 'x' peel mask is used.
-                        // Along 'y' face, the 'y' peel mask is used.
-                        // Along an 'x-y' edge, they are ANDed to make this mask:
-                        //   0 0 0 0
-                        //   0 0 1 1
-                        //   0 0 1 1
-                        //   0 0 1 1
-                        // so that the 6 corner elements are updated.
-
-                        if (vbgn < fvbgn || vend > fvend) {
-                            idx_t pmask = 0, rmask = 0;
-
-                            // Need to set upper bit.
-                            idx_t mbit = 0x1 << (dims->_fold_pts.product() - 1);
-
-                            // Visit points in a vec-fold.
-                            dims->_fold_pts.visitAllPoints
-                                ([&](const IdxTuple& pt, size_t idx) {
-
-                                    // Shift masks to next posn.
-                                    pmask >>= 1;
-                                    rmask >>= 1;
-
-                                    // If the peel point is within the sub-block,
-                                    // set the next bit in the mask.
-                                    idx_t pi = vbgn + pt[j];
-                                    if (pi >= ebgn)
-                                        pmask |= mbit;
-
-                                    // If the rem point is within the sub-block,
-                                    // put a 1 in the mask.
-                                    pi = fvend + pt[j];
-                                    if (pi < eend)
-                                        rmask |= mbit;
-
-                                    // Keep visiting.
-                                    return true;
-                                });
-
-                            // Save masks in this dim.
-                            peel_masks[i] = pmask;
-                            rem_masks[i] = rmask;
-                        }
-
-                        // Anything not covered?
-                        // This will only be needed in inner dim because we
-                        // will do partial vectors in other dims.
-                        // Set 'scalar_for_peel_rem' to indicate we only want to
-                        // do peel and/or rem in scalar loop.
-                        if (i == _inner_posn && (ebgn < vbgn || eend > vend)) {
-                            do_scalars = true;
-                            scalar_for_peel_rem = true;
-                        }
+        // i: index for stencil dims, j: index for domain dims.
+        for (int i = 0, j = 0; i < nsdims; i++) {
+            if (i != step_posn) {
+
+                // Rank offset.
+                auto rofs = cp->rank_domain_offsets[j];
+
+                // Begin/end of rank-relative scalar elements in this dim.
+                auto ebgn = sub_block_idxs.begin[i] - rofs;
+                auto eend = sub_block_idxs.end[i] - rofs;
+                sub_block_eidxs.begin[i] = ebgn;
+                sub_block_eidxs.end[i] = eend;
+
+                // Find range of full clusters.
+                // Note that fcend <= eend because we round
+                // down to get whole clusters only.
+                // Similarly, fcbgn >= ebgn.
+                auto cpts = dims->_cluster_pts[j];
+                auto fcbgn = round_up_flr(ebgn, cpts);
+                auto fcend = round_down_flr(eend, cpts);
+                sub_block_fcidxs.begin[i] = fcbgn;
+                sub_block_fcidxs.end[i] = fcend;
+
+                // Any clusters to do?
+                if (fcend <= fcbgn)
+                    do_clusters = false;
+
+                // If anything before or after clusters, continue with
+                // setting vector indices and peel/rem masks.
+                if (fcbgn > ebgn || fcend < eend) {
+
+                    // Find range of full and/or partial vectors.
+                    // Note that fvend <= eend because we round
+                    // down to get whole vectors only.
+                    // Note that vend >= eend because we round
+                    // up to include partial vectors.
+                    // Similar but opposite for begin vars.
+                    // We make a vector mask to pick the
+                    // right elements.
+                    // TODO: use compile-time consts instead
+                    // of _fold_pts for more efficiency.
+                    auto vpts = dims->_fold_pts[j];
+                    auto fvbgn = round_up_flr(ebgn, vpts);
+                    auto fvend = round_down_flr(eend, vpts);
+                    auto vbgn = round_down_flr(ebgn, vpts);
+                    auto vend = round_up_flr(eend, vpts);
+                    if (i == _inner_posn) {
+
+                        // Don't do any full and/or partial vectors in
+                        // plane of inner dim.  We'll do these with
+                        // scalars.  This is unusual because vector
+                        // folding is normally done in a plane
+                        // perpendicular to the inner dim for >= 2D
+                        // domains.
+                        fvbgn = vbgn = fcbgn;
+                        fvend = vend = fcend;
+                    }
+                    sub_block_fvidxs.begin[i] = fvbgn;
+                    sub_block_fvidxs.end[i] = fvend;
+                    sub_block_vidxs.begin[i] = vbgn;
+                    sub_block_vidxs.end[i] = vend;
+
+                    // Any vectors to do (full and/or partial)?
+                    if (vbgn < fcbgn || vend > fcend)
+                        do_vectors = true;
+
+                    // Calculate masks in this dim for partial vectors.
+                    // All such masks will be ANDed together to form the
+                    // final masks over all domain dims.
+                    // Example: assume folding is x=4*y=4.
+                    // Possible 'x' peel mask to exclude 1st 2 cols:
+                    //   0 0 1 1
+                    //   0 0 1 1
+                    //   0 0 1 1
+                    //   0 0 1 1
+                    // Possible 'y' peel mask to exclude 1st row:
+                    //   0 0 0 0
+                    //   1 1 1 1
+                    //   1 1 1 1
+                    //   1 1 1 1
+                    // Along 'x' face, the 'x' peel mask is used.
+                    // Along 'y' face, the 'y' peel mask is used.
+                    // Along an 'x-y' edge, they are ANDed to make this mask:
+                    //   0 0 0 0
+                    //   0 0 1 1
+                    //   0 0 1 1
+                    //   0 0 1 1
+                    // so that the 6 corner elements are updated.
+
+                    if (vbgn < fvbgn || vend > fvend) {
+                        idx_t pmask = 0, rmask = 0;
+
+                        // Need to set upper bit; shift an idx_t, not an int.
+                        idx_t mbit = idx_t(0x1) << (dims->_fold_pts.product() - 1);
+
+                        // Visit points in a vec-fold.
+                        dims->_fold_pts.visitAllPoints
+                            ([&](const IdxTuple& pt, size_t idx) {
+
+                                // Shift masks to next posn.
+                                pmask >>= 1;
+                                rmask >>= 1;
+
+                                // If the peel point is within the sub-block,
+                                // set the next bit in the mask.
+                                idx_t pi = vbgn + pt[j];
+                                if (pi >= ebgn)
+                                    pmask |= mbit;
+
+                                // If the rem point is within the sub-block,
+                                // put a 1 in the mask.
+                                pi = fvend + pt[j];
+                                if (pi < eend)
+                                    rmask |= mbit;
+
+                                // Keep visiting.
+                                return true;
+                            });
+
+                        // Save masks in this dim.
+                        peel_masks[i] = pmask;
+                        rem_masks[i] = rmask;
                     }
 
-                    // If no peel or rem, just set vec indices to same as
-                    // full cluster.
-                    else {
-                        sub_block_fvidxs.begin[i] = fcbgn;
-                        sub_block_fvidxs.end[i] = fcend;
-                        sub_block_vidxs.begin[i] = fcbgn;
-                        sub_block_vidxs.end[i] = fcend;
+                    // Anything not covered?
+                    // This will only be needed in inner dim because we
+                    // will do partial vectors in other dims.
+                    // Set 'scalar_for_peel_rem' to indicate we only want to
+                    // do peel and/or rem in scalar loop.
+                    if (i == _inner_posn && (ebgn < vbgn || eend > vend)) {
+                        do_scalars = true;
+                        scalar_for_peel_rem = true;
                     }
+                }
 
-                    // Next domain index.
-                    j++;
+                // If no peel or rem, just set vec indices to same as
+                // full cluster.
+                else {
+                    sub_block_fvidxs.begin[i] = fcbgn;
+                    sub_block_fvidxs.end[i] = fcend;
+                    sub_block_vidxs.begin[i] = fcbgn;
+                    sub_block_vidxs.end[i] = fcend;
                 }
+
+                // Next domain index.
+                j++;
             }
+        }
 #endif
             
         // Normalized indices needed for sub-block loop.
@@ -435,7 +432,7 @@ namespace yask {
 
             // Define the function called from the generated loops
             // to simply call the loop-of-clusters functions.
-#define calc_inner_loop(thread_idx, loop_idxs) \
+#define calc_inner_loop(thread_idx, loop_idxs)                  \
             calc_loop_of_clusters(thread_idx, loop_idxs)
 
             // Include automatically-generated loop code that calls
@@ -492,7 +489,7 @@ namespace yask {
             // See the mask diagrams above that show how the
             // masks are ANDed together.
             // Since step is always 1, we ignore loop_idxs.stop.
-#define calc_inner_loop(thread_idx, loop_idxs) \
+#define calc_inner_loop(thread_idx, loop_idxs)                          \
             bool ok = false;                                            \
             idx_t mask = idx_t(-1);                                     \
             for (int i = 0; i < nsdims; i++) {                          \
@@ -500,11 +497,11 @@ namespace yask {
                     i != _inner_posn &&                                 \
                     (loop_idxs.start[i] < norm_sub_block_fcidxs.begin[i] || \
                      loop_idxs.start[i] >= norm_sub_block_fcidxs.end[i])) { \
-                        ok = true;                                      \
-                        if (loop_idxs.start[i] < norm_sub_block_fvidxs.begin[i]) \
-                            mask &= peel_masks[i];                      \
-                        if (loop_idxs.start[i] >= norm_sub_block_fvidxs.end[i]) \
-                            mask &= rem_masks[i];                       \
+                    ok = true;                                          \
+                    if (loop_idxs.start[i] < norm_sub_block_fvidxs.begin[i]) \
+                        mask &= peel_masks[i];                          \
+                    if (loop_idxs.start[i] >= norm_sub_block_fvidxs.end[i]) \
+                        mask &= rem_masks[i];                           \
                 }                                                       \
             }                                                           \
             if (ok) calc_loop_of_vectors(thread_idx, loop_idxs, mask);
@@ -564,7 +561,7 @@ namespace yask {
 
         // Make sure streaming stores are visible for later loads.
         make_stores_visible();
-
+        
     } // calc_sub_block.
 
     // Calculate a series of cluster results within an inner loop.
@@ -572,7 +569,7 @@ namespace yask {
     // Indices must be rank-relative.
     // Indices must be normalized, i.e., already divided by VLEN_*.
     void StencilBundleBase::calc_loop_of_clusters(int thread_idx,
-                                                 const ScanIndices& loop_idxs) {
+                                                  const ScanIndices& loop_idxs) {
         auto* cp = _generic_context;
         auto& dims = cp->get_dims();
         int nsdims = dims->_stencil_dims.size();
@@ -608,8 +605,8 @@ namespace yask {
     // Indices must be rank-relative.
     // Indices must be normalized, i.e., already divided by VLEN_*.
     void StencilBundleBase::calc_loop_of_vectors(int thread_idx,
-                                                const ScanIndices& loop_idxs,
-                                                idx_t write_mask) {
+                                                 const ScanIndices& loop_idxs,
+                                                 idx_t write_mask) {
         auto* cp = _generic_context;
         auto& dims = cp->get_dims();
         int nsdims = dims->_stencil_dims.size();
@@ -649,8 +646,9 @@ namespace yask {
     // its halo sizes are still used to specify how much to
     // add to 'idxs'.
     // Return adjusted indices.
-    ScanIndices StencilBundleBase::adjust_span(int thread_idx, const ScanIndices& idxs) const {
-
+    ScanIndices StencilBundleBase::adjust_span(int thread_idx,
+                                               const ScanIndices& idxs) const {
+        
         ScanIndices adj_idxs(idxs);
         auto* cp = _generic_context;
         auto& dims = cp->get_dims();
@@ -676,16 +674,16 @@ namespace yask {
                 int posn = gp->get_dim_posn(dname);
                 if (posn >= 0) {
 
-                    // Make sure grid domain covers block.
-                    assert(idxs.begin[i] >= gp->get_first_rank_domain_index(posn));
-                    assert(idxs.end[i] <= gp->get_last_rank_domain_index(posn) + 1);
-
                     // Adjust begin & end scan indices based on halos.
                     idx_t lh = gp->get_left_halo_size(posn);
                     idx_t rh = gp->get_right_halo_size(posn);
                     adj_idxs.begin[i] = idxs.begin[i] - lh;
                     adj_idxs.end[i] = idxs.end[i] + rh;
 
+                    // Make sure grid allocation covers block plus halos.
+                    assert(adj_idxs.begin[i] >= gp->get_first_rank_alloc_index(posn));
+                    assert(adj_idxs.end[i] <= gp->get_last_rank_alloc_index(posn) + 1);
+
                     // If existing step is >= whole tile, adjust it also.
                     idx_t width = idxs.end[i] - idxs.begin[i];
                     if (idxs.step[i] >= width) {
diff --git a/src/kernel/lib/stencil_calc.hpp b/src/kernel/lib/stencil_calc.hpp
index b10e5a98..e562f00e 100644
--- a/src/kernel/lib/stencil_calc.hpp
+++ b/src/kernel/lib/stencil_calc.hpp
@@ -221,12 +221,17 @@ namespace yask {
     };                          // StencilBundleBase.
 
     // A collection of independent stencil bundles.
+    // "Independent" implies that they may be evaluated
+    // in any order.
     class BundlePack :
         public std::vector<StencilBundleBase*> {
 
     protected:
         std::string _name;
 
+        // Union of bounding boxes for all bundles.
+        BoundingBox _pack_bb;
+        
     public:
         BundlePack(const std::string& name) :
             _name(name) { }
@@ -236,6 +241,9 @@ namespace yask {
             return _name;
         }
 
+        // Access to BB.
+        virtual BoundingBox& getBB() { return _pack_bb; }
+
     }; // BundlePack.
 
 } // yask namespace.
diff --git a/src/kernel/yask.sh b/src/kernel/yask.sh
index 372bd585..177051de 100755
--- a/src/kernel/yask.sh
+++ b/src/kernel/yask.sh
@@ -271,6 +271,20 @@ fi
 
 echo "Log saved in '$logfile'."
 
-if [[ `grep -c FAILED $logfile` > 0 ]]; then
+# Checks for issues; 'exe_str' names the run in the messages below.
+exe_str="'$mpi_cmd $exe_prefix $exe $opts $@'"
+
+# Return a non-zero exit condition if test failed.
+if grep -q 'TEST FAILED' "$logfile"; then
+    echo "$exe_str did not pass internal validation test."
     exit 1;
 fi
+
+# Return non-zero if the log has no 'YASK DONE' line or cannot be read.
+if ! grep -q 'YASK DONE' "$logfile"; then
+    echo "$exe_str did not exit cleanly."
+    exit 1;
+fi
+
+echo "$exe_str ran successfully."
+exit 0;
diff --git a/src/kernel/yask_main.cpp b/src/kernel/yask_main.cpp
index ca30aa41..dca6dbf0 100644
--- a/src/kernel/yask_main.cpp
+++ b/src/kernel/yask_main.cpp
@@ -256,13 +256,15 @@ int main(int argc, char** argv)
         auto ksoln = kfac.new_solution(kenv);
         auto context = dynamic_pointer_cast<StencilContext>(ksoln);
         assert(context.get());
+
+        // Replace the default settings with 'opts'.
         context->set_settings(opts);
-        ostream& os = context->set_ostr();
 
         // Make sure any MPI/OMP debug data is dumped from all ranks before continuing.
         kenv->global_barrier();
 
         // Print splash banner and related info.
+        ostream& os = context->set_ostr();
         opts->splash(os, argc, argv);
 
         // Override alloc if requested.
@@ -379,6 +381,7 @@ int main(int argc, char** argv)
             ref_context->name += "-reference";
             ref_context->allow_vec_exchange = false;   // exchange scalars in halos.
 
+            // Override allocations and prep the ref soln, as with the main soln.
             alloc_steps(ref_soln, *opts);
             ref_soln->prepare_solution();