From eb8fcf2d364edaceb3c13f57087e3f106780d182 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 11:09:58 -0700 Subject: [PATCH] Mark grids dirty in a rank if *any* rank could make it dirty. Closes #106. --- src/kernel/lib/context.cpp | 41 ++++++++---- src/kernel/lib/setup.cpp | 127 +++++++++++++++++++++---------------- 2 files changed, 101 insertions(+), 67 deletions(-) diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 668696d1..7c521bf0 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -271,9 +271,13 @@ namespace yask { #include "yask_misc_loops.hpp" #undef misc_fn - // Remember grids that have been written to by this bundle, + // Mark grids that [may] have been written to by this bundle, // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *asg); + // Mark grids as dirty even if not actually written by this + // rank. This is needed because neighbors will not know what + // grids are actually dirty, and all ranks must have the same + // information about which grids are possibly dirty. + mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } // needed bundles. } // all bundles. @@ -436,9 +440,9 @@ namespace yask { } // If doing wave-fronts, must loop through all bundles in - // calc_region(). - // TODO: make this the only case, allowing all bundles to be done - // between MPI exchanges, even w/o wave-fronts. + // calc_region(). TODO: consider making this the only case, + // allowing all bundles to be done between MPI exchanges, even + // w/o wave-fronts. else { // Exchange all dirty halo(s). @@ -606,13 +610,18 @@ namespace yask { // similar for y and z. This code typically // contains the outer OpenMP loop(s). #include "yask_region_loops.hpp" - - // Remember grids that have been written to by this bundle, - // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } - // Shift spatial region boundaries for next iteration to + // Mark grids that [may] have been written to by this bundle, + // updated at next step (+/- 1). + // Mark grids as dirty even if not actually written by this + // rank. This is needed because neighbors will not know what + // grids are actually dirty, and all ranks must have the same + // information about which grids are possibly dirty. + // TODO: make this smarter. + mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); + + // Shift spatial region boundaries for next iteration to // implement temporal wavefront. Between regions, we only shift // backward, so region loops must strictly increment. They may do // so in any order. TODO: shift only what is needed by @@ -1184,7 +1193,7 @@ namespace yask { if (!gp->is_dirty(t)) continue; - // Only need to swap grids that have MPI buffers. + // Only need to swap grids that have any MPI buffers. auto& gname = gp->get_name(); if (mpiData.count(gname) == 0) continue; @@ -1226,7 +1235,7 @@ namespace yask { auto gp = gtsi.second; gi++; MPI_Request* grid_recv_reqs = recv_reqs[gi]; - TRACE_MSG(" for grid '" << gname << "'..."); + TRACE_MSG(" for grid #" << gi << ", '" << gname << "'..."); // Visit all this rank's neighbors. auto& grid_mpi_data = mpiData.at(gname); @@ -1260,6 +1269,8 @@ namespace yask { neighbor_rank, int(gi), _env->comm, &grid_recv_reqs[ni]); num_recv_reqs++; } + else + TRACE_MSG(" 0B to request"); } // Pack data into send buffer, then send to neighbor. 
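Note on the context.cpp changes above: the halo-exchange code posts receives and sends for a grid only when its local is_dirty() flag is set, and the matching operation on the neighbor is guarded by the same test on the neighbor's side, so every rank must reach the same dirty/not-dirty decision for every grid and step. The short, self-contained C++ sketch below (hypothetical names, not the YASK API) illustrates how the old rule lets the flags diverge between ranks, which is exactly what this patch removes.

    // Minimal standalone sketch (hypothetical types and names, not the YASK API)
    // of why the dirty flag must be computed identically on every rank: each side
    // of a halo exchange decides on its own whether to post a send or receive,
    // so the decisions must match.
    #include <iostream>

    struct RankView {
        bool bundle_ran;   // the bundle that can write this grid was executed
        bool wrote_data;   // this rank's sub-domain was actually written
    };

    // Old rule: a grid is dirty only if this rank actually wrote it.
    bool dirty_old(const RankView& r) { return r.wrote_data; }

    // New rule (this patch): a grid is dirty if *any* rank could have written it,
    // i.e., whenever the writing bundle ran, regardless of local writes.
    bool dirty_new(const RankView& r) { return r.bundle_ran; }

    int main() {
        RankView left  {true, true};    // left rank wrote its part of the grid
        RankView right {true, false};   // right rank happened to write nothing

        // Old rule: left would send a halo that right never expects to receive.
        std::cout << "old rule, ranks agree: "
                  << std::boolalpha << (dirty_old(left) == dirty_old(right)) << "\n";

        // New rule: both ranks reach the same conclusion and the exchange matches.
        std::cout << "new rule, ranks agree: "
                  << (dirty_new(left) == dirty_new(right)) << "\n";
    }

With the old rule, a rank whose sub-domain happened not to be written would skip its side of the exchange while its neighbor did not; with the new rule the decision depends only on information every rank shares.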
@@ -1276,7 +1287,7 @@ namespace yask {
                     IdxTuple first = sendBuf.begin_pt;
                     IdxTuple last = sendBuf.last_pt;
 
-                    // The code in allocData() pre-calculated the first and
+                    // The code in allocMpiData() pre-calculated the first and
                     // last points of each buffer, except in the step dim.
                     // So, we need to set that value now.
                     // TODO: update this if we expand the buffers to hold
@@ -1305,6 +1316,8 @@ namespace yask {
                                   neighbor_rank, int(gi), _env->comm,
                                   &send_reqs[num_send_reqs++]);
                     }
+                    else
+                        TRACE_MSG(" 0B to send");
                 }
 
                 // Wait for data from neighbor, then unpack it.
@@ -1343,6 +1356,8 @@ namespace yask {
                         n = gp->set_elements_in_slice(buf, first, last);
                         assert(n == recvBuf.get_size());
                     }
+                    else
+                        TRACE_MSG(" 0B to wait for");
                 }
 
             }); // visit neighbors.
diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp
index 58fb63ef..d4aacc8f 100644
--- a/src/kernel/lib/setup.cpp
+++ b/src/kernel/lib/setup.cpp
@@ -328,7 +328,7 @@ namespace yask {
         } // grid passes.
     };
 
-    // Create MPI and allocate buffers.
+    // Create MPI buffers and allocate them.
    void StencilContext::allocMpiData(ostream& os) {
 
        // Remove any old MPI data.
@@ -336,7 +336,8 @@ namespace yask {
 
 #ifdef USE_MPI
 
-        int num_exchanges = 0;
+        map<int, int> num_exchanges; // send/recv => count.
+        map<int, idx_t> num_elems; // send/recv => count.
         auto me = _env->my_rank;
 
         // Need to determine the size and shape of all MPI buffers.
@@ -369,10 +370,24 @@ namespace yask {
                 return; // from lambda fn.
             }
 
-            // Determine size of MPI buffers between neigh_rank and my rank
-            // for each grid and create those that are needed.
+            // Is vectorized exchange allowed based on domain sizes?
+            // Both my rank and neighbor rank must have all domain sizes
+            // of vector multiples.
+            bool vec_ok = allow_vec_exchange &&
+                _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] &&
+                _mpiInfo->has_all_vlen_mults[neigh_idx];
+
+            // Determine size of MPI buffers between neigh_rank and my
+            // rank for each grid and create those that are needed. It
+            // is critical that the number, size, and shape of my
+            // send/receive buffers match those of the receive/send
+            // buffers of my neighbors. Important: Current algorithm
+            // assumes my left neighbor's buffer sizes can be calculated
+            // by considering my rank's right side data and vice-versa.
+            // Thus, all ranks must have consistent data that contribute
+            // to these calculations.
             for (auto gp : gridPtrs) {
-                if (!gp)
+                if (!gp || gp->is_scratch() || gp->is_fixed_size())
                     continue;
 
                 auto& gname = gp->get_name();
@@ -384,12 +399,15 @@ namespace yask {
                 IdxTuple first_outer_idx, last_outer_idx;
                 for (auto& dim : _dims->_domain_dims.getDims()) {
                     auto& dname = dim.getName();
+
+                    // Only consider domain dims that are used in this grid.
                     if (gp->is_dim_used(dname)) {
 
-                        // Get domain indices for this grid.
-                        // If there are no more ranks in the given direction, extend
-                        // the index into the outer halo to make sure all data are sync'd.
-                        // This is critical for WFs.
+                        // Get domain indices for this grid. If there
+                        // are no more ranks in the given direction,
+                        // extend the "outer" index to include the halo
+                        // in that direction to make sure all data are
+                        // sync'd. This is critical for WFs.
                         idx_t fidx = gp->get_first_rank_domain_index(dname);
                         idx_t lidx = gp->get_last_rank_domain_index(dname);
                         first_inner_idx.addDimBack(dname, fidx);
@@ -401,55 +419,57 @@ namespace yask {
                         first_outer_idx.addDimBack(dname, fidx);
                         last_outer_idx.addDimBack(dname, lidx);
-                        // Determine size of exchange. This will be the actual halo size
-                        // plus any wave-front extensions. 
In the current implementation, - // we need the wave-front extensions regardless of whether there - // is a halo on a given grid. This is because each stencil-bundle - // gets shifted by the WF angles at each step in the WF. + // Determine size of exchange in this dim. This + // will be the actual halo size plus any + // wave-front shifts. In the current + // implementation, we need the wave-front shifts + // regardless of whether there is a halo on a + // given grid. This is because each + // stencil-bundle gets shifted by the WF angles + // at each step in the WF. - // Neighbor is to the left. + // Neighbor is to the left in this dim. if (neigh_offsets[dname] == MPIInfo::rank_prev) { - auto ext = left_wf_exts[dname]; + auto ext = wf_shifts[dname]; - // my halo. + // my halo on my left. auto halo_size = gp->get_left_halo_size(dname); halo_size += ext; my_halo_sizes.addDimBack(dname, halo_size); - // neighbor halo. - halo_size = gp->get_right_halo_size(dname); // their right is on my left. + // neighbor halo on their right. + halo_size = gp->get_right_halo_size(dname); // assume their right == my right. halo_size += ext; neigh_halo_sizes.addDimBack(dname, halo_size); + + // Flag that this grid has a neighbor to left or right. + found_delta = true; } - // Neighbor is to the right. + // Neighbor is to the right in this dim. else if (neigh_offsets[dname] == MPIInfo::rank_next) { - auto ext = right_wf_exts[dname]; + auto ext = wf_shifts[dname]; - // my halo. + // my halo on my right. auto halo_size = gp->get_right_halo_size(dname); halo_size += ext; my_halo_sizes.addDimBack(dname, halo_size); - // neighbor halo. - halo_size = gp->get_left_halo_size(dname); // their left is on my right. + // neighbor halo on their left. + halo_size = gp->get_left_halo_size(dname); // assume their left == my left. halo_size += ext; neigh_halo_sizes.addDimBack(dname, halo_size); + + // Flag that this grid has a neighbor to left or right. + found_delta = true; } - // Neighbor in-line. + // Neighbor in-line in this dim. else { my_halo_sizes.addDimBack(dname, 0); neigh_halo_sizes.addDimBack(dname, 0); } - // Vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have all domain sizes - // of vector multiples. - bool vec_ok = allow_vec_exchange && - _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && - _mpiInfo->has_all_vlen_mults[neigh_idx]; - // Round up halo sizes if vectorized exchanges allowed. // TODO: add a heuristic to avoid increasing by a large factor. if (vec_ok) { @@ -457,12 +477,8 @@ namespace yask { my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); } - - // Is this neighbor before or after me in this domain direction? - if (neigh_offsets[dname] != MPIInfo::rank_self) - found_delta = true; - } - } + } // domain dims in this grid. + } // domain dims. // Is buffer needed? // Example: if this grid is 2D in y-z, but only neighbors are in @@ -589,11 +605,19 @@ namespace yask { } // all dims in this grid. + // Unique name for buffer based on grid name, direction, and ranks. + ostringstream oss; + oss << gname; + if (bd == MPIBufs::bufSend) + oss << "_send_halo_from_" << me << "_to_" << neigh_rank; + else if (bd == MPIBufs::bufRecv) + oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; + string bufname = oss.str(); + // Does buffer have non-zero size? 
if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { - TRACE_MSG("no halo exchange needed for grid '" << gname << - "' with rank " << neigh_rank << - " because there is no data to exchange"); + TRACE_MSG("MPI buffer '" << bufname << + "' not needed because there is no data to exchange"); continue; } @@ -602,15 +626,6 @@ namespace yask { // Convert end to last. IdxTuple copy_last = copy_end.subElements(1); - // Unique name for buffer based on grid name, direction, and ranks. - ostringstream oss; - oss << gname; - if (bd == MPIBufs::bufSend) - oss << "_send_halo_from_" << me << "_to_" << neigh_rank; - else if (bd == MPIBufs::bufRecv) - oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; - string bufname = oss.str(); - // Make MPI data entry for this grid. auto gbp = mpiData.emplace(gname, _mpiInfo); auto& gbi = gbp.first; // iterator from pair returned by emplace(). @@ -625,18 +640,22 @@ namespace yask { buf.name = bufname; buf.has_all_vlen_mults = vlen_mults; - TRACE_MSG("configured MPI buffer object '" << buf.name << - "' for rank at relative offsets " << + TRACE_MSG("MPI buffer '" << buf.name << + "' configured for rank at relative offsets " << neigh_offsets.subElements(1).makeDimValStr() << " with " << buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() << " element(s) at " << buf.begin_pt.makeDimValStr() << " ... " << buf.last_pt.makeDimValStr()); - num_exchanges++; + num_exchanges[bd]++; + num_elems[bd] += buf.get_size(); } // send, recv. } // grids. }); // neighbors. - TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges); + TRACE_MSG("number of MPI send buffers on this rank: " << num_exchanges[int(MPIBufs::bufSend)]); + TRACE_MSG("number of elements in send buffers: " << makeNumStr(num_elems[int(MPIBufs::bufSend)])); + TRACE_MSG("number of MPI recv buffers on this rank: " << num_exchanges[int(MPIBufs::bufRecv)]); + TRACE_MSG("number of elements in recv buffers: " << makeNumStr(num_elems[int(MPIBufs::bufRecv)])); // Base ptrs for all alloc'd data. // These pointers will be shared by the ones in the grid
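Note on the buffer-sizing logic in setup.cpp above: for each domain dimension with a neighbor, the exchange width is the grid's halo in that direction plus the wave-front shift, rounded up to a multiple of the vector length when both ranks allow vectorized exchanges. A rough standalone sketch of that arithmetic (local helper names, not YASK code) follows; because both ranks evaluate it from the same inputs, the send buffer on one side and the receive buffer on the other come out the same size.

    // Rough standalone sketch (local helpers, not YASK code) of the per-dimension
    // sizing described above: halo plus wave-front shift, rounded up to a whole
    // number of vectors when vectorized exchange is permitted on both ranks.
    #include <cstdint>
    #include <iostream>

    using idx_t = std::int64_t;

    idx_t round_up(idx_t n, idx_t mult) {
        return ((n + mult - 1) / mult) * mult;
    }

    // Width of the halo exchange in one domain dimension.
    idx_t exchange_width(idx_t halo, idx_t wf_shift, bool vec_ok, idx_t vlen) {
        idx_t w = halo + wf_shift;      // halo plus any wave-front shift
        if (vec_ok)
            w = round_up(w, vlen);      // whole vectors only, if allowed
        return w;
    }

    int main() {
        // Example: halo of 4 points, wave-front shift of 2, vector length of 8.
        std::cout << "scalar exchange width: " << exchange_width(4, 2, false, 8) << "\n"; // 6
        std::cout << "vector exchange width: " << exchange_width(4, 2, true, 8)  << "\n"; // 8
    }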
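Note on the buffer names introduced above: each buffer gets a unique name built from the grid name, the transfer direction, and the two ranks involved, which keeps the TRACE_MSG output unambiguous when many buffers exist. The small illustration below copies the format from the patch but uses a local enum and helper as stand-ins for the YASK classes.

    // Standalone illustration of the buffer-naming convention added above; the
    // format itself is taken from the patch, but the enum and helper are local
    // stand-ins rather than YASK classes.
    #include <iostream>
    #include <sstream>
    #include <string>

    enum class BufDir { Send, Recv };

    std::string make_buf_name(const std::string& gname, BufDir bd,
                              int me, int neigh_rank) {
        std::ostringstream oss;
        oss << gname;
        if (bd == BufDir::Send)
            oss << "_send_halo_from_" << me << "_to_" << neigh_rank;
        else
            oss << "_recv_halo_from_" << neigh_rank << "_to_" << me;
        return oss.str();
    }

    int main() {
        // Rank 0 exchanging the halo of a grid named "vel" with rank 1:
        std::cout << make_buf_name("vel", BufDir::Send, 0, 1) << "\n"; // vel_send_halo_from_0_to_1
        std::cout << make_buf_name("vel", BufDir::Recv, 0, 1) << "\n"; // vel_recv_halo_from_1_to_0
    }

A send buffer named X_send_halo_from_A_to_B on rank A pairs with the buffer named X_recv_halo_from_A_to_B on rank B.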