diff --git a/Makefile b/Makefile index f032a10b..760985fe 100644 --- a/Makefile +++ b/Makefile @@ -221,8 +221,8 @@ api-tests: compiler-api all-tests: compiler-api $(MAKE) tuple-test - $(MAKE) combo-api-tests $(YK_MAKE) $@ + $(MAKE) combo-api-tests all: $(MAKE) realclean diff --git a/docs/YASK-intro.pdf b/docs/YASK-intro.pdf old mode 100755 new mode 100644 index 67b28457..584c1026 Binary files a/docs/YASK-intro.pdf and b/docs/YASK-intro.pdf differ diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index a8876243..d05a50bb 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -41,7 +41,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.10.00"; + const string version = "2.10.02"; string yask_get_version_string() { return version; diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 42803aba..f08766b8 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -762,7 +762,7 @@ py-api-no-yc: # Validation runs for each binary. val1 := -dt 2 -b 16 -d 48 -val2 := -dt 2 -b 24 -r 32 -rt 2 -dx 63 -dy 49 -dz 47 +val2 := -dt 2 -b 24 -r 32 -rt 2 -d 63 ranks := 2 # Run the kernel binary using several combos of sizes and ranks. diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 5e35d4ea..69ec35d0 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -564,32 +564,34 @@ namespace yask { // time-step, the parallelogram may be trimmed based on the // BB and WF extensions outside of the rank-BB. - // Actual region boundaries must stay within [extended] rank BB. + // Actual region boundaries must stay within [extended] pack BB. // We have to calculate the posn in the extended rank at each // value of 'shift_num' because it is being shifted spatially. bool ok = true; - for (int i = 0; i < ndims; i++) { + auto& pbb = bp->getBB(); + for (int i = 0, j = 0; i < ndims; i++) { if (i == step_posn) continue; - auto& dname = _dims->_stencil_dims.getDimName(i); - auto angle = wf_angles[dname]; + auto angle = wf_angles[j]; // Begin point. - idx_t dbegin = rank_bb.bb_begin[dname]; - idx_t rbegin = max(start[i], ext_bb.bb_begin[dname]); + idx_t dbegin = rank_bb.bb_begin[j]; // non-extended domain. + idx_t rbegin = max(start[i], pbb.bb_begin[j]); if (rbegin < dbegin) // in left WF ext? - rbegin = max(rbegin, dbegin - left_wf_exts[dname] + shift_num * angle); + rbegin = max(rbegin, dbegin - left_wf_exts[j] + shift_num * angle); region_idxs.begin[i] = rbegin; // End point. - idx_t dend = rank_bb.bb_end[dname]; - idx_t rend = min(stop[i], ext_bb.bb_end[dname]); + idx_t dend = rank_bb.bb_end[j]; // non-extended domain. + idx_t rend = min(stop[i], pbb.bb_end[j]); if (rend > dend) // in right WF ext? - rend = min(rend, dend + right_wf_exts[dname] - shift_num * angle); + rend = min(rend, dend + right_wf_exts[j] - shift_num * angle); region_idxs.end[i] = rend; // Anything to do? if (rend <= rbegin) ok = false; + + j++; // next domain index. } TRACE_MSG("calc_region: region span after trimming: " << region_idxs.begin.makeValStr(ndims) << @@ -624,13 +626,13 @@ namespace yask { // left, so region loops must strictly increment. They may do // so in any order. TODO: shift only what is needed by // this pack, not the global max. - for (int i = 0; i < ndims; i++) { + for (int i = 0, j = 0; i < ndims; i++) { if (i == step_posn) continue; - auto& dname = _dims->_stencil_dims.getDimName(i); - auto angle = wf_angles[dname]; + auto angle = wf_angles[j]; start[i] -= angle; stop[i] -= angle; + j++; } shift_num++; diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index 649e9292..4324f369 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -173,7 +173,7 @@ namespace yask { BoundingBox rank_bb; // BB with any needed extensions for wave-fronts. - // If WFs are not used, this is the same as rank_bb; + // If WFs are not used, this is the same as 'rank_bb'; BoundingBox ext_bb; // List of all non-scratch stencil bundles in the order in which diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp index aea4664a..5aa37489 100644 --- a/src/kernel/lib/grid_apis.cpp +++ b/src/kernel/lib/grid_apis.cpp @@ -158,7 +158,7 @@ namespace yask { // Determine required padding from halos. Indices left_pads2 = getReqdPad(_left_halos, _left_wf_exts); - Indices right_pads2 = getReqdPad(_right_halos, _left_wf_exts); + Indices right_pads2 = getReqdPad(_right_halos, _right_wf_exts); // NB: requirements to successful share_storage() is not as strict as // is_storage_layout_identical(). See note on pad & halo below and API docs. diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp index 291dd071..540cbcb0 100644 --- a/src/kernel/lib/setup.cpp +++ b/src/kernel/lib/setup.cpp @@ -111,14 +111,16 @@ namespace yask { // Myself. if (rn == me) { if (mandist != 0) - FORMAT_AND_THROW_YASK_EXCEPTION("Internal error: distance to own rank == " << mandist); + FORMAT_AND_THROW_YASK_EXCEPTION + ("Internal error: distance to own rank == " << mandist); } // Someone else. else { if (mandist == 0) - FORMAT_AND_THROW_YASK_EXCEPTION("Error: ranks " << me << - " and " << rn << " at same coordinates"); + FORMAT_AND_THROW_YASK_EXCEPTION + ("Error: ranks " << me << + " and " << rn << " at same coordinates"); } // Loop through domain dims. @@ -157,13 +159,14 @@ namespace yask { auto rnsz = rsizes[rn][dj]; if (mysz != rnsz) { auto& dnamej = _opts->_rank_indices.getDimName(dj); - FORMAT_AND_THROW_YASK_EXCEPTION("Error: rank " << rn << " and " << me << - " are both at rank-index " << coords[me][di] << - " in the '" << dname << - "' dimension , but their rank-domain sizes are " << - rnsz << " and " << mysz << - " (resp.) in the '" << dj << - "' dimension, making them unaligned"); + FORMAT_AND_THROW_YASK_EXCEPTION + ("Error: rank " << rn << " and " << me << + " are both at rank-index " << coords[me][di] << + " in the '" << dname << + "' dimension , but their rank-domain sizes are " << + rnsz << " and " << mysz << + " (resp.) in the '" << dj << + "' dimension, making them unaligned"); } } } @@ -316,7 +319,7 @@ namespace yask { // Determine padded size (also offset to next location). size_t nbytes = gp->get_num_storage_bytes(); npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, - CACHELINE_BYTES); + CACHELINE_BYTES); ngrids[numa_pref]++; if (pass == 0) TRACE_MSG(" grid '" << gname << "' needs " << makeByteStr(nbytes) << @@ -880,6 +883,11 @@ namespace yask { { assert(_opts); + // If we haven't finished constructing the context, it's too early + // to do this. + if (!stPacks.size()) + return; + // Reset halos to zero. max_halos = _dims->_domain_dims; @@ -921,15 +929,19 @@ namespace yask { // of angles and extensions. auto& step_dim = _dims->_step_dim; auto wf_steps = _opts->_region_sizes[step_dim]; + assert(wf_steps >= 1); num_wf_shifts = 0; if (wf_steps > 1) { // Need to shift for each bundle pack. - num_wf_shifts = stPacks.size() * wf_steps; + assert(stPacks.size() > 0); + num_wf_shifts = idx_t(stPacks.size()) * wf_steps; + assert(num_wf_shifts > 1); // Don't need to shift first one. num_wf_shifts--; } + assert(num_wf_shifts >= 0); for (auto& dim : _dims->_domain_dims.getDims()) { auto& dname = dim.getName(); auto rksize = _opts->_rank_sizes[dname]; @@ -945,10 +957,12 @@ namespace yask { if (_opts->_region_sizes[dname] < rksize || nranks > 0) angle = ROUND_UP(max_halos[dname], _dims->_fold_pts[dname]); wf_angles[dname] = angle; + assert(angle >= 0); // Determine the total WF shift to be added in each dim. idx_t shifts = angle * num_wf_shifts; wf_shifts[dname] = shifts; + assert(shifts >= 0); // Is domain size at least as large as halo + wf_ext in direction // when there are multiple ranks? @@ -1117,7 +1131,7 @@ namespace yask { " rank-domain-offsets: " << rank_domain_offsets.makeDimValOffsetStr() << endl << #endif " rank-domain: " << rank_bb.bb_begin.makeDimValStr() << - " ... " << rank_bb.bb_end.subElements(1).makeDimValStr() << endl << + " ... " << rank_bb.bb_end.subElements(1).makeDimValStr() << endl << " vector-len: " << VLEN << endl << " extra-padding: " << _opts->_extra_pad_sizes.makeDimValStr() << endl << " minimum-padding: " << _opts->_min_pad_sizes.makeDimValStr() << endl << @@ -1137,9 +1151,9 @@ namespace yask { os << endl; // Info about eqs, packs and bundles. - os << "Num stencil equations: " << NUM_STENCIL_EQS << endl; - os << "Num stencil bundles: " << stBundles.size() << endl; - os << "Num stencil packs: " << stPacks.size() << endl; + os << "Num stencil packs: " << stPacks.size() << endl; + os << "Num stencil bundles: " << stBundles.size() << endl; + os << "Num stencil equations: " << NUM_STENCIL_EQS << endl; #if NUM_STENCIL_EQS @@ -1149,9 +1163,13 @@ namespace yask { rank_numFpOps_1t = 0; for (auto& sp : stPacks) { - os << "Bundle(s) in pack '" << sp->get_name() << "':\n"; - for (auto* sg : *sp) { + auto& pbb = sp->getBB(); + os << "Pack '" << sp->get_name() << "':\n" << + " num bundles: " << sp->size() << endl << + " sub-domain scope: " << pbb.bb_begin.makeDimValStr() << + " ... " << pbb.bb_end.subElements(1).makeDimValStr() << endl; + for (auto* sg : *sp) { idx_t updates1 = 0, reads1 = 0, fpops1 = 0; // Loop through all the needed bundles to @@ -1325,10 +1343,23 @@ namespace yask { ext_bb.bb_end = rank_bb.bb_end.addElements(right_wf_exts); ext_bb.update_bb("extended-rank", *this, true); - // Find BB for each bundle. Each will be a subset within - // 'ext_bb'. - for (auto sg : stBundles) - sg->find_bounding_box(); + // Find BB for each pack. + for (auto sp : stPacks) { + auto& spbb = sp->getBB(); + spbb.bb_begin = _dims->_domain_dims; + spbb.bb_end = _dims->_domain_dims; + + // Find BB for each bundle in this pack. + for (auto sb : *sp) { + sb->find_bounding_box(); + + // Expand pack BB to encompass bundle BB. + auto& sbbb = sb->getBB(); + spbb.bb_begin = spbb.bb_begin.minElements(sbbb.bb_begin); + spbb.bb_end = spbb.bb_end.maxElements(sbbb.bb_end); + } + spbb.update_bb(sp->get_name(), *this, false); + } } // Find the bounding-boxes for this bundle in this rank. @@ -1352,9 +1383,8 @@ namespace yask { Indices max_pts(idx_min, nsdims); idx_t npts = 0; - // Begin, end tuples. - // Scan across domain in this rank including - // any extensions for wave-fronts. + // Begin, end tuples. Use 'ext_bb' to scan across domain in this + // rank including any extensions for wave-fronts. IdxTuple begin(stencil_dims); begin.setVals(context.ext_bb.bb_begin, false); begin[step_dim] = 0; @@ -1369,11 +1399,11 @@ namespace yask { // Define misc-loop function. Since step is always 1, we ignore // misc_stop. Update only if point is in domain for this bundle. -#define misc_fn(misc_idxs) do { \ - if (is_in_valid_domain(misc_idxs.start)) { \ - min_pts = min_pts.minElements(misc_idxs.start); \ - max_pts = max_pts.maxElements(misc_idxs.start); \ - npts++; \ +#define misc_fn(misc_idxs) do { \ + if (is_in_valid_domain(misc_idxs.start)) { \ + min_pts = min_pts.minElements(misc_idxs.start); \ + max_pts = max_pts.maxElements(misc_idxs.start); \ + npts++; \ } } while(0) // Define OMP reductions to be used in generated code. @@ -1631,11 +1661,11 @@ namespace yask { bb_is_full = true; if (bb_num_points != bb_size) { if (os) - *os << "Note: '" << name << "' domain has only " << - makeNumStr(bb_num_points) << - " valid point(s) inside its bounding-box of " << - makeNumStr(bb_size) << - " point(s); multiple sub-boxes will be used.\n"; + *os << "Note: '" << name << "' domain has only " << + makeNumStr(bb_num_points) << + " valid point(s) inside its bounding-box of " << + makeNumStr(bb_size) << + " point(s); multiple sub-boxes will be used.\n"; bb_is_full = false; } @@ -1646,9 +1676,9 @@ namespace yask { if ((bb_begin[dname] - context.rank_domain_offsets[dname]) % dims->_fold_pts[dname] != 0) { if (os) - *os << "Note: '" << name << "' domain" - " has one or more starting edges not on vector boundaries;" - " masked calculations will be used in peel and remainder sub-blocks.\n"; + *os << "Note: '" << name << "' domain" + " has one or more starting edges not on vector boundaries;" + " masked calculations will be used in peel and remainder sub-blocks.\n"; bb_is_aligned = false; break; } @@ -1661,9 +1691,9 @@ namespace yask { if (bb_len[dname] % dims->_cluster_pts[dname] != 0) { if (bb_is_full && bb_is_aligned) if (os && bb_is_aligned) - *os << "Note: '" << name << "' domain" - " has one or more sizes that are not vector-cluster multiples;" - " masked calculations will be used in peel and remainder sub-blocks.\n"; + *os << "Note: '" << name << "' domain" + " has one or more sizes that are not vector-cluster multiples;" + " masked calculations will be used in peel and remainder sub-blocks.\n"; bb_is_cluster_mult = false; break; } diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index 328d966e..d7aa47c7 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -62,20 +62,16 @@ namespace yask { // Trim the default block indices based on the bounding box(es) // for this bundle. - // TODO: replace string-based lookup w/indices. ScanIndices bb_idxs(def_block_idxs); - for (int i = 0; i < nsdims; i++) { + for (int i = 0, j = 0; i < nsdims; i++) { if (i == step_posn) continue; - auto& dname = dims->_stencil_dims.getDimName(i); // Begin point. - assert(bb.bb_begin.lookup(dname)); - auto bbegin = max(def_block_idxs.begin[i], bb.bb_begin[dname]); + auto bbegin = max(def_block_idxs.begin[i], bb.bb_begin[j]); bb_idxs.begin[i] = bbegin; // End point. - assert(bb.bb_end.lookup(dname)); - auto bend = min(def_block_idxs.end[i], bb.bb_end[dname]); + auto bend = min(def_block_idxs.end[i], bb.bb_end[j]); bb_idxs.end[i] = bend; // Anything to do? @@ -83,6 +79,7 @@ namespace yask { bb_ok = false; break; } + j++; // next domain index. } // nothing to do? @@ -92,42 +89,42 @@ namespace yask { continue; // to next BB. } - TRACE_MSG3("calc_block for bundle '" << get_name() << - "': after trimming for BB " << bbn << ": " << - bb_idxs.begin.makeValStr(nsdims) << - " ... (end before) " << bb_idxs.end.makeValStr(nsdims)); - - // Update offsets of scratch grids based on this bundle's location. - _generic_context->update_scratch_grid_info(thread_idx, bb_idxs.begin); - - // Get the bundles that need to be processed in - // this block. This will be any prerequisite scratch-grid - // bundles plus this non-scratch bundle. - auto sg_list = get_reqd_bundles(); - - // Set number of threads for a block. - // Each of these threads will work on a sub-block. - // This should be nested within a top-level OpenMP task. - _generic_context->set_block_threads(); - - // Loop through all the needed bundles. - for (auto* sg : sg_list) { - - // Indices needed for the generated loops. Will normally be a - // copy of 'bb_idxs' except when updating scratch-grids. - ScanIndices block_idxs = sg->adjust_span(thread_idx, bb_idxs); - - TRACE_MSG3("calc_block for bundle '" << get_name() << "': " << - " in reqd bundle '" << sg->get_name() << "': " << - block_idxs.begin.makeValStr(nsdims) << - " ... (end before) " << block_idxs.end.makeValStr(nsdims) << - " by thread " << thread_idx); - - // Include automatically-generated loop code that calls - // calc_sub_block() for each sub-block in this block. This - // code typically contains the nested OpenMP loop(s). + TRACE_MSG3("calc_block for bundle '" << get_name() << + "': after trimming for BB " << bbn << ": " << + bb_idxs.begin.makeValStr(nsdims) << + " ... (end before) " << bb_idxs.end.makeValStr(nsdims)); + + // Update offsets of scratch grids based on this bundle's location. + _generic_context->update_scratch_grid_info(thread_idx, bb_idxs.begin); + + // Get the bundles that need to be processed in + // this block. This will be any prerequisite scratch-grid + // bundles plus this non-scratch bundle. + auto sg_list = get_reqd_bundles(); + + // Set number of threads for a block. + // Each of these threads will work on a sub-block. + // This should be nested within a top-level OpenMP task. + _generic_context->set_block_threads(); + + // Loop through all the needed bundles. + for (auto* sg : sg_list) { + + // Indices needed for the generated loops. Will normally be a + // copy of 'bb_idxs' except when updating scratch-grids. + ScanIndices block_idxs = sg->adjust_span(thread_idx, bb_idxs); + + TRACE_MSG3("calc_block for bundle '" << get_name() << "': " << + " in reqd bundle '" << sg->get_name() << "': " << + block_idxs.begin.makeValStr(nsdims) << + " ... (end before) " << block_idxs.end.makeValStr(nsdims) << + " by thread " << thread_idx); + + // Include automatically-generated loop code that calls + // calc_sub_block() for each sub-block in this block. This + // code typically contains the nested OpenMP loop(s). #include "yask_block_loops.hpp" - } + } } // BB list. } @@ -182,10 +179,10 @@ namespace yask { /* Indices in each domain dim: - sub_block_eidxs.begin rem_masks used here - |peel_masks used here | sub_block_eidxs.end - || | | - vv v v + sub_block_eidxs.begin rem_masks used here + |peel_masks used here | sub_block_eidxs.end + || | | + vv v v |---|-------|---------------------------|---|---| <- "|" on vec boundaries. ^ ^ ^ ^ ^ ^ | | | | | | @@ -251,158 +248,158 @@ namespace yask { // Determine the subset of this sub-block that is // clusters, vectors, and partial vectors. #else - do_clusters = true; - do_vectors = false; - do_scalars = false; + do_clusters = true; + do_vectors = false; + do_scalars = false; - // i: index for stencil dims, j: index for domain dims. - for (int i = 0, j = 0; i < nsdims; i++) { - if (i != step_posn) { - - // Rank offset. - auto rofs = cp->rank_domain_offsets[j]; - - // Begin/end of rank-relative scalar elements in this dim. - auto ebgn = sub_block_idxs.begin[i] - rofs; - auto eend = sub_block_idxs.end[i] - rofs; - sub_block_eidxs.begin[i] = ebgn; - sub_block_eidxs.end[i] = eend; - - // Find range of full clusters. - // Note that fcend <= eend because we round - // down to get whole clusters only. - // Similarly, fcbgn >= ebgn. - auto cpts = dims->_cluster_pts[j]; - auto fcbgn = round_up_flr(ebgn, cpts); - auto fcend = round_down_flr(eend, cpts); - sub_block_fcidxs.begin[i] = fcbgn; - sub_block_fcidxs.end[i] = fcend; - - // Any clusters to do? - if (fcend <= fcbgn) - do_clusters = false; - - // If anything before or after clusters, continue with - // setting vector indices and peel/rem masks. - if (fcbgn > ebgn || fcend < eend) { - - // Find range of full and/or partial vectors. - // Note that fvend <= eend because we round - // down to get whole vectors only. - // Note that vend >= eend because we round - // up to include partial vectors. - // Similar but opposite for begin vars. - // We make a vector mask to pick the - // right elements. - // TODO: use compile-time consts instead - // of _fold_pts for more efficiency. - auto vpts = dims->_fold_pts[j]; - auto fvbgn = round_up_flr(ebgn, vpts); - auto fvend = round_down_flr(eend, vpts); - auto vbgn = round_down_flr(ebgn, vpts); - auto vend = round_up_flr(eend, vpts); - if (i == _inner_posn) { - - // Don't do any full and/or partial vectors in - // plane of inner dim. We'll do these with - // scalars. This is unusual because vector - // folding is normally done in a plane - // perpendicular to the inner dim for >= 2D - // domains. - fvbgn = vbgn = fcbgn; - fvend = vend = fcend; - } - sub_block_fvidxs.begin[i] = fvbgn; - sub_block_fvidxs.end[i] = fvend; - sub_block_vidxs.begin[i] = vbgn; - sub_block_vidxs.end[i] = vend; - - // Any vectors to do (full and/or partial)? - if (vbgn < fcbgn || vend > fcend) - do_vectors = true; - - // Calculate masks in this dim for partial vectors. - // All such masks will be ANDed together to form the - // final masks over all domain dims. - // Example: assume folding is x=4*y=4. - // Possible 'x' peel mask to exclude 1st 2 cols: - // 0 0 1 1 - // 0 0 1 1 - // 0 0 1 1 - // 0 0 1 1 - // Possible 'y' peel mask to exclude 1st row: - // 0 0 0 0 - // 1 1 1 1 - // 1 1 1 1 - // 1 1 1 1 - // Along 'x' face, the 'x' peel mask is used. - // Along 'y' face, the 'y' peel mask is used. - // Along an 'x-y' edge, they are ANDed to make this mask: - // 0 0 0 0 - // 0 0 1 1 - // 0 0 1 1 - // 0 0 1 1 - // so that the 6 corner elements are updated. - - if (vbgn < fvbgn || vend > fvend) { - idx_t pmask = 0, rmask = 0; - - // Need to set upper bit. - idx_t mbit = 0x1 << (dims->_fold_pts.product() - 1); - - // Visit points in a vec-fold. - dims->_fold_pts.visitAllPoints - ([&](const IdxTuple& pt, size_t idx) { - - // Shift masks to next posn. - pmask >>= 1; - rmask >>= 1; - - // If the peel point is within the sub-block, - // set the next bit in the mask. - idx_t pi = vbgn + pt[j]; - if (pi >= ebgn) - pmask |= mbit; - - // If the rem point is within the sub-block, - // put a 1 in the mask. - pi = fvend + pt[j]; - if (pi < eend) - rmask |= mbit; - - // Keep visiting. - return true; - }); - - // Save masks in this dim. - peel_masks[i] = pmask; - rem_masks[i] = rmask; - } - - // Anything not covered? - // This will only be needed in inner dim because we - // will do partial vectors in other dims. - // Set 'scalar_for_peel_rem' to indicate we only want to - // do peel and/or rem in scalar loop. - if (i == _inner_posn && (ebgn < vbgn || eend > vend)) { - do_scalars = true; - scalar_for_peel_rem = true; - } + // i: index for stencil dims, j: index for domain dims. + for (int i = 0, j = 0; i < nsdims; i++) { + if (i != step_posn) { + + // Rank offset. + auto rofs = cp->rank_domain_offsets[j]; + + // Begin/end of rank-relative scalar elements in this dim. + auto ebgn = sub_block_idxs.begin[i] - rofs; + auto eend = sub_block_idxs.end[i] - rofs; + sub_block_eidxs.begin[i] = ebgn; + sub_block_eidxs.end[i] = eend; + + // Find range of full clusters. + // Note that fcend <= eend because we round + // down to get whole clusters only. + // Similarly, fcbgn >= ebgn. + auto cpts = dims->_cluster_pts[j]; + auto fcbgn = round_up_flr(ebgn, cpts); + auto fcend = round_down_flr(eend, cpts); + sub_block_fcidxs.begin[i] = fcbgn; + sub_block_fcidxs.end[i] = fcend; + + // Any clusters to do? + if (fcend <= fcbgn) + do_clusters = false; + + // If anything before or after clusters, continue with + // setting vector indices and peel/rem masks. + if (fcbgn > ebgn || fcend < eend) { + + // Find range of full and/or partial vectors. + // Note that fvend <= eend because we round + // down to get whole vectors only. + // Note that vend >= eend because we round + // up to include partial vectors. + // Similar but opposite for begin vars. + // We make a vector mask to pick the + // right elements. + // TODO: use compile-time consts instead + // of _fold_pts for more efficiency. + auto vpts = dims->_fold_pts[j]; + auto fvbgn = round_up_flr(ebgn, vpts); + auto fvend = round_down_flr(eend, vpts); + auto vbgn = round_down_flr(ebgn, vpts); + auto vend = round_up_flr(eend, vpts); + if (i == _inner_posn) { + + // Don't do any full and/or partial vectors in + // plane of inner dim. We'll do these with + // scalars. This is unusual because vector + // folding is normally done in a plane + // perpendicular to the inner dim for >= 2D + // domains. + fvbgn = vbgn = fcbgn; + fvend = vend = fcend; + } + sub_block_fvidxs.begin[i] = fvbgn; + sub_block_fvidxs.end[i] = fvend; + sub_block_vidxs.begin[i] = vbgn; + sub_block_vidxs.end[i] = vend; + + // Any vectors to do (full and/or partial)? + if (vbgn < fcbgn || vend > fcend) + do_vectors = true; + + // Calculate masks in this dim for partial vectors. + // All such masks will be ANDed together to form the + // final masks over all domain dims. + // Example: assume folding is x=4*y=4. + // Possible 'x' peel mask to exclude 1st 2 cols: + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // Possible 'y' peel mask to exclude 1st row: + // 0 0 0 0 + // 1 1 1 1 + // 1 1 1 1 + // 1 1 1 1 + // Along 'x' face, the 'x' peel mask is used. + // Along 'y' face, the 'y' peel mask is used. + // Along an 'x-y' edge, they are ANDed to make this mask: + // 0 0 0 0 + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // so that the 6 corner elements are updated. + + if (vbgn < fvbgn || vend > fvend) { + idx_t pmask = 0, rmask = 0; + + // Need to set upper bit. + idx_t mbit = 0x1 << (dims->_fold_pts.product() - 1); + + // Visit points in a vec-fold. + dims->_fold_pts.visitAllPoints + ([&](const IdxTuple& pt, size_t idx) { + + // Shift masks to next posn. + pmask >>= 1; + rmask >>= 1; + + // If the peel point is within the sub-block, + // set the next bit in the mask. + idx_t pi = vbgn + pt[j]; + if (pi >= ebgn) + pmask |= mbit; + + // If the rem point is within the sub-block, + // put a 1 in the mask. + pi = fvend + pt[j]; + if (pi < eend) + rmask |= mbit; + + // Keep visiting. + return true; + }); + + // Save masks in this dim. + peel_masks[i] = pmask; + rem_masks[i] = rmask; } - // If no peel or rem, just set vec indices to same as - // full cluster. - else { - sub_block_fvidxs.begin[i] = fcbgn; - sub_block_fvidxs.end[i] = fcend; - sub_block_vidxs.begin[i] = fcbgn; - sub_block_vidxs.end[i] = fcend; + // Anything not covered? + // This will only be needed in inner dim because we + // will do partial vectors in other dims. + // Set 'scalar_for_peel_rem' to indicate we only want to + // do peel and/or rem in scalar loop. + if (i == _inner_posn && (ebgn < vbgn || eend > vend)) { + do_scalars = true; + scalar_for_peel_rem = true; } + } - // Next domain index. - j++; + // If no peel or rem, just set vec indices to same as + // full cluster. + else { + sub_block_fvidxs.begin[i] = fcbgn; + sub_block_fvidxs.end[i] = fcend; + sub_block_vidxs.begin[i] = fcbgn; + sub_block_vidxs.end[i] = fcend; } + + // Next domain index. + j++; } + } #endif // Normalized indices needed for sub-block loop. @@ -435,7 +432,7 @@ namespace yask { // Define the function called from the generated loops // to simply call the loop-of-clusters functions. -#define calc_inner_loop(thread_idx, loop_idxs) \ +#define calc_inner_loop(thread_idx, loop_idxs) \ calc_loop_of_clusters(thread_idx, loop_idxs) // Include automatically-generated loop code that calls @@ -492,7 +489,7 @@ namespace yask { // See the mask diagrams above that show how the // masks are ANDed together. // Since step is always 1, we ignore loop_idxs.stop. -#define calc_inner_loop(thread_idx, loop_idxs) \ +#define calc_inner_loop(thread_idx, loop_idxs) \ bool ok = false; \ idx_t mask = idx_t(-1); \ for (int i = 0; i < nsdims; i++) { \ @@ -500,11 +497,11 @@ namespace yask { i != _inner_posn && \ (loop_idxs.start[i] < norm_sub_block_fcidxs.begin[i] || \ loop_idxs.start[i] >= norm_sub_block_fcidxs.end[i])) { \ - ok = true; \ - if (loop_idxs.start[i] < norm_sub_block_fvidxs.begin[i]) \ - mask &= peel_masks[i]; \ - if (loop_idxs.start[i] >= norm_sub_block_fvidxs.end[i]) \ - mask &= rem_masks[i]; \ + ok = true; \ + if (loop_idxs.start[i] < norm_sub_block_fvidxs.begin[i]) \ + mask &= peel_masks[i]; \ + if (loop_idxs.start[i] >= norm_sub_block_fvidxs.end[i]) \ + mask &= rem_masks[i]; \ } \ } \ if (ok) calc_loop_of_vectors(thread_idx, loop_idxs, mask); @@ -564,7 +561,7 @@ namespace yask { // Make sure streaming stores are visible for later loads. make_stores_visible(); - + } // calc_sub_block. // Calculate a series of cluster results within an inner loop. @@ -572,7 +569,7 @@ namespace yask { // Indices must be rank-relative. // Indices must be normalized, i.e., already divided by VLEN_*. void StencilBundleBase::calc_loop_of_clusters(int thread_idx, - const ScanIndices& loop_idxs) { + const ScanIndices& loop_idxs) { auto* cp = _generic_context; auto& dims = cp->get_dims(); int nsdims = dims->_stencil_dims.size(); @@ -608,8 +605,8 @@ namespace yask { // Indices must be rank-relative. // Indices must be normalized, i.e., already divided by VLEN_*. void StencilBundleBase::calc_loop_of_vectors(int thread_idx, - const ScanIndices& loop_idxs, - idx_t write_mask) { + const ScanIndices& loop_idxs, + idx_t write_mask) { auto* cp = _generic_context; auto& dims = cp->get_dims(); int nsdims = dims->_stencil_dims.size(); @@ -649,8 +646,9 @@ namespace yask { // its halo sizes are still used to specify how much to // add to 'idxs'. // Return adjusted indices. - ScanIndices StencilBundleBase::adjust_span(int thread_idx, const ScanIndices& idxs) const { - + ScanIndices StencilBundleBase::adjust_span(int thread_idx, + const ScanIndices& idxs) const { + ScanIndices adj_idxs(idxs); auto* cp = _generic_context; auto& dims = cp->get_dims(); @@ -676,16 +674,16 @@ namespace yask { int posn = gp->get_dim_posn(dname); if (posn >= 0) { - // Make sure grid domain covers block. - assert(idxs.begin[i] >= gp->get_first_rank_domain_index(posn)); - assert(idxs.end[i] <= gp->get_last_rank_domain_index(posn) + 1); - // Adjust begin & end scan indices based on halos. idx_t lh = gp->get_left_halo_size(posn); idx_t rh = gp->get_right_halo_size(posn); adj_idxs.begin[i] = idxs.begin[i] - lh; adj_idxs.end[i] = idxs.end[i] + rh; + // Make sure grid covers block. + assert(adj_idxs.begin[i] >= gp->get_first_rank_alloc_index(posn)); + assert(adj_idxs.end[i] <= gp->get_last_rank_alloc_index(posn) + 1); + // If existing step is >= whole tile, adjust it also. idx_t width = idxs.end[i] - idxs.begin[i]; if (idxs.step[i] >= width) { diff --git a/src/kernel/lib/stencil_calc.hpp b/src/kernel/lib/stencil_calc.hpp index b10e5a98..e562f00e 100644 --- a/src/kernel/lib/stencil_calc.hpp +++ b/src/kernel/lib/stencil_calc.hpp @@ -221,12 +221,17 @@ namespace yask { }; // StencilBundleBase. // A collection of independent stencil bundles. + // "Independent" implies that they may be evaluated + // in any order. class BundlePack : public std::vector { protected: std::string _name; + // Union of bounding boxes for all bundles. + BoundingBox _pack_bb; + public: BundlePack(const std::string& name) : _name(name) { } @@ -236,6 +241,9 @@ namespace yask { return _name; } + // Access to BB. + virtual BoundingBox& getBB() { return _pack_bb; } + }; // BundlePack. } // yask namespace. diff --git a/src/kernel/yask.sh b/src/kernel/yask.sh index 372bd585..177051de 100755 --- a/src/kernel/yask.sh +++ b/src/kernel/yask.sh @@ -271,6 +271,20 @@ fi echo "Log saved in '$logfile'." -if [[ `grep -c FAILED $logfile` > 0 ]]; then +# Checks for issues. +exe_str="'$mpi_cmd $exe_prefix $exe $opts $@'" + +# Return a non-zero exit condition if test failed. +if [[ `grep -c 'TEST FAILED' $logfile` > 0 ]]; then + echo $exe_str did not pass internal validation test. exit 1; fi + +# Return a non-zero exit condition if executable didn't exit cleanly. +if [[ `grep -c 'YASK DONE' $logfile` == 0 ]]; then + echo $exe_str did not exit cleanly. + exit 1; +fi + +echo $exe_str ran successfully. +exit 0; diff --git a/src/kernel/yask_main.cpp b/src/kernel/yask_main.cpp index ca30aa41..dca6dbf0 100644 --- a/src/kernel/yask_main.cpp +++ b/src/kernel/yask_main.cpp @@ -256,13 +256,15 @@ int main(int argc, char** argv) auto ksoln = kfac.new_solution(kenv); auto context = dynamic_pointer_cast(ksoln); assert(context.get()); + + // Replace the default settings with 'opts'. context->set_settings(opts); - ostream& os = context->set_ostr(); // Make sure any MPI/OMP debug data is dumped from all ranks before continuing. kenv->global_barrier(); // Print splash banner and related info. + ostream& os = context->set_ostr(); opts->splash(os, argc, argv); // Override alloc if requested. @@ -379,6 +381,7 @@ int main(int argc, char** argv) ref_context->name += "-reference"; ref_context->allow_vec_exchange = false; // exchange scalars in halos. + // Override allocations and prep solution as with ref soln. alloc_steps(ref_soln, *opts); ref_soln->prepare_solution();