Mark grids dirty in a rank if *any* rank could make it dirty. Closes #…
chuckyount committed Apr 28, 2018
1 parent d1c8d0c commit eb8fcf2
Showing 2 changed files with 101 additions and 67 deletions.
41 changes: 28 additions & 13 deletions src/kernel/lib/context.cpp
@@ -271,9 +271,13 @@ namespace yask {
#include "yask_misc_loops.hpp"
#undef misc_fn

// Remember grids that have been written to by this bundle,
// Mark grids that [may] have been written to by this bundle,
// updated at next step (+/- 1).
mark_grids_dirty(start_t + step_t, stop_t + step_t, *asg);
// Mark grids as dirty even if not actually written by this
// rank. This is needed because neighbors will not know what
// grids are actually dirty, and all ranks must have the same
// information about which grids are possibly dirty.
mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg);

} // needed bundles.
} // all bundles.
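For context, a minimal, self-contained sketch of why the dirty flags must be set identically on every rank (hypothetical `DirtyTracker` type, not the YASK API): halo exchanges are posted only for grids flagged dirty, so if one rank skipped a bundle that a neighbor ran, it must still set the flag, or the matching send/receive pair would never be posted.

```cpp
// Minimal sketch (hypothetical types, not the YASK API) showing why the
// dirty flags must be identical on all ranks: a send/recv pair is posted
// only when a grid is flagged dirty, so the flag must mean "any rank may
// have written", not "this rank wrote".
#include <iostream>
#include <map>
#include <set>
#include <string>

struct DirtyTracker {
    // Grid name -> set of step indices flagged dirty.
    std::map<std::string, std::set<long>> dirty;

    // Called after a bundle runs. 'wrote_locally' is ignored on purpose:
    // a rank cannot know whether its neighbors wrote the grid at this
    // step, so all ranks must make the same decision.
    void mark_dirty(const std::string& grid, long step, bool /*wrote_locally*/) {
        dirty[grid].insert(step);
    }

    // Halo exchange is needed iff the grid is flagged dirty at 'step'.
    bool needs_exchange(const std::string& grid, long step) const {
        auto it = dirty.find(grid);
        return it != dirty.end() && it->second.count(step) > 0;
    }
};

int main() {
    DirtyTracker rank_a, rank_b;
    // Rank B writes grid "u" at step 1; rank A's sub-domain was not
    // touched by that bundle, but A still marks the grid dirty.
    rank_b.mark_dirty("u", 1, true);
    rank_a.mark_dirty("u", 1, false);
    // Both ranks now agree an exchange is needed, so the send posted by B
    // is matched by a receive on A.
    std::cout << rank_a.needs_exchange("u", 1) << " "
              << rank_b.needs_exchange("u", 1) << "\n"; // prints "1 1"
}
```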
@@ -436,9 +440,9 @@ namespace yask {
}

// If doing wave-fronts, must loop through all bundles in
// calc_region().
// TODO: make this the only case, allowing all bundles to be done
// between MPI exchanges, even w/o wave-fronts.
// calc_region(). TODO: consider making this the only case,
// allowing all bundles to be done between MPI exchanges, even
// w/o wave-fronts.
else {

// Exchange all dirty halo(s).
Expand Down Expand Up @@ -606,13 +610,18 @@ namespace yask {
// similar for y and z. This code typically
// contains the outer OpenMP loop(s).
#include "yask_region_loops.hpp"

// Remember grids that have been written to by this bundle,
// updated at next step (+/- 1).
mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg);
}

// Shift spatial region boundaries for next iteration to
// Mark grids that [may] have been written to by this bundle,
// updated at next step (+/- 1).
// Mark grids as dirty even if not actually written by this
// rank. This is needed because neighbors will not know what
// grids are actually dirty, and all ranks must have the same
// information about which grids are possibly dirty.
// TODO: make this smarter.
mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg);

// Shift spatial region boundaries for next iteration to
// implement temporal wavefront. Between regions, we only shift
// backward, so region loops must strictly increment. They may do
// so in any order. TODO: shift only what is needed by
@@ -1184,7 +1193,7 @@ namespace yask {
if (!gp->is_dirty(t))
continue;

// Only need to swap grids that have MPI buffers.
// Only need to swap grids that have any MPI buffers.
auto& gname = gp->get_name();
if (mpiData.count(gname) == 0)
continue;
@@ -1226,7 +1235,7 @@ namespace yask {
auto gp = gtsi.second;
gi++;
MPI_Request* grid_recv_reqs = recv_reqs[gi];
TRACE_MSG(" for grid '" << gname << "'...");
TRACE_MSG(" for grid #" << gi << ", '" << gname << "'...");

// Visit all this rank's neighbors.
auto& grid_mpi_data = mpiData.at(gname);
@@ -1260,6 +1269,8 @@ namespace yask {
neighbor_rank, int(gi), _env->comm, &grid_recv_reqs[ni]);
num_recv_reqs++;
}
else
TRACE_MSG(" 0B to request");
}

// Pack data into send buffer, then send to neighbor.
@@ -1276,7 +1287,7 @@
IdxTuple first = sendBuf.begin_pt;
IdxTuple last = sendBuf.last_pt;

// The code in allocData() pre-calculated the first and
// The code in allocMpiData() pre-calculated the first and
// last points of each buffer, except in the step dim.
// So, we need to set that value now.
// TODO: update this if we expand the buffers to hold
@@ -1305,6 +1316,8 @@ namespace yask {
neighbor_rank, int(gi), _env->comm,
&send_reqs[num_send_reqs++]);
}
else
TRACE_MSG(" 0B to send");
}
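A hedged sketch of the pack-and-send step above, with toy types standing in for `IdxTuple` and the grid class (the real `get_elements_in_slice()` signature may differ): the spatial bounds of each buffer were pre-computed in `allocMpiData()`, so only the step index is filled in per exchange before the slice is copied into the contiguous send buffer.

```cpp
// Toy types standing in for IdxTuple and the YASK grid class; only the
// structure of the packing step is illustrated, not the real API.
#include <cassert>
#include <map>
#include <string>
#include <vector>

using IdxMap = std::map<std::string, long>;   // dim name -> index.

struct ToyGrid {
    // Pretend copy of the slice [first, last] (inclusive) into 'buf';
    // returns the number of elements "copied".
    size_t get_elements_in_slice(std::vector<double>& buf,
                                 const IdxMap& first, const IdxMap& last) const {
        size_t n = 1;
        for (const auto& d : first)
            n *= size_t(last.at(d.first) - d.second + 1);
        buf.assign(n, 0.0);                   // real code copies grid data here.
        return n;
    }
};

struct ToySendBuf {
    IdxMap begin_pt, last_pt;                 // spatial bounds from buffer setup.
    std::vector<double> data;                 // contiguous buffer handed to MPI_Isend.
};

// Pack one grid's halo for step 't'; the caller then posts the MPI_Isend.
size_t pack_halo(const ToyGrid& g, ToySendBuf& sb,
                 const std::string& step_dim, long t) {
    IdxMap first = sb.begin_pt, last = sb.last_pt;
    first[step_dim] = t;                      // step index was not known at
    last[step_dim] = t;                       // alloc time, so set it now.
    return g.get_elements_in_slice(sb.data, first, last);
}

int main() {
    ToyGrid g;
    ToySendBuf sb;
    sb.begin_pt = {{"x", 0}, {"y", 0}};
    sb.last_pt  = {{"x", 3}, {"y", 7}};
    size_t n = pack_halo(g, sb, "t", 5);
    assert(n == 4 * 8 * 1);                   // x: 4, y: 8, t: 1 element.
}
```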

// Wait for data from neighbor, then unpack it.
@@ -1343,6 +1356,8 @@ namespace yask {
n = gp->set_elements_in_slice(buf, first, last);
assert(n == recvBuf.get_size());
}
else
TRACE_MSG(" 0B to wait for");
}
}); // visit neighbors.

127 changes: 73 additions & 54 deletions src/kernel/lib/setup.cpp
@@ -328,15 +328,16 @@ namespace yask {
} // grid passes.
};

// Create MPI and allocate buffers.
// Create MPI buffers and allocate them.
void StencilContext::allocMpiData(ostream& os) {

// Remove any old MPI data.
freeMpiData(os);

#ifdef USE_MPI

int num_exchanges = 0;
map<int, int> num_exchanges; // send/recv => count.
map<int, idx_t> num_elems; // send/recv => count.
auto me = _env->my_rank;

// Need to determine the size and shape of all MPI buffers.
@@ -369,10 +370,24 @@
return; // from lambda fn.
}

// Determine size of MPI buffers between neigh_rank and my rank
// for each grid and create those that are needed.
// Is vectorized exchange allowed based on domain sizes?
// Both my rank and neighbor rank must have all domain sizes
// of vector multiples.
bool vec_ok = allow_vec_exchange &&
_mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] &&
_mpiInfo->has_all_vlen_mults[neigh_idx];

// Determine size of MPI buffers between neigh_rank and my
// rank for each grid and create those that are needed. It
// is critical that the number, size, and shape of my
// send/receive buffers match those of the receive/send
// buffers of my neighbors. Important: Current algorithm
// assumes my left neighbor's buffer sizes can be calculated
// by considering my rank's right side data and vice-versa.
// Thus, all ranks must have consistent data that contribute
// to these calculations.
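To make the symmetry requirement above concrete, here is a small sketch (hypothetical helper functions, not the YASK code): across one rank boundary, the element count a rank plans to send left must equal the count its left neighbor plans to receive from its right, so both sides must derive the size from the same halo and wave-front quantities.

```cpp
// Hypothetical helpers illustrating the matching requirement between the
// send/recv buffers of neighboring ranks; all ranks must agree on these
// inputs for the counts to match.
#include <cassert>

struct HaloParams {
    long left_halo;    // left-side halo width in this dim.
    long right_halo;   // right-side halo width in this dim.
    long wf_shift;     // wave-front shift added to every exchange.
};

// Elements per "row" that rank R sends to its LEFT neighbor: R fills that
// neighbor's right halo, assuming the neighbor's right halo equals R's own.
long send_to_left(const HaloParams& p)    { return p.right_halo + p.wf_shift; }

// Elements per "row" that rank R expects from its RIGHT neighbor into R's
// own right halo.
long recv_from_right(const HaloParams& p) { return p.right_halo + p.wf_shift; }

int main() {
    // Same params on every rank (the consistency the comment demands).
    HaloParams p{2, 3, 1};
    // Rank R's send-to-left must match rank R-1's recv-from-right;
    // otherwise the MPI messages would have mismatched sizes.
    assert(send_to_left(p) == recv_from_right(p));
}
```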
for (auto gp : gridPtrs) {
if (!gp)
if (!gp || gp->is_scratch() || gp->is_fixed_size())
continue;
auto& gname = gp->get_name();

@@ -384,12 +399,15 @@
IdxTuple first_outer_idx, last_outer_idx;
for (auto& dim : _dims->_domain_dims.getDims()) {
auto& dname = dim.getName();

// Only consider domain dims that are used in this grid.
if (gp->is_dim_used(dname)) {

// Get domain indices for this grid.
// If there are no more ranks in the given direction, extend
// the index into the outer halo to make sure all data are sync'd.
// This is critical for WFs.
// Get domain indices for this grid. If there
// are no more ranks in the given direction,
// extend the "outer" index to include the halo
// in that direction to make sure all data are
// sync'd. This is critical for WFs.
idx_t fidx = gp->get_first_rank_domain_index(dname);
idx_t lidx = gp->get_last_rank_domain_index(dname);
first_inner_idx.addDimBack(dname, fidx);
@@ -401,68 +419,66 @@
first_outer_idx.addDimBack(dname, fidx);
last_outer_idx.addDimBack(dname, lidx);

// Determine size of exchange. This will be the actual halo size
// plus any wave-front extensions. In the current implementation,
// we need the wave-front extensions regardless of whether there
// is a halo on a given grid. This is because each stencil-bundle
// gets shifted by the WF angles at each step in the WF.
// Determine size of exchange in this dim. This
// will be the actual halo size plus any
// wave-front shifts. In the current
// implementation, we need the wave-front shifts
// regardless of whether there is a halo on a
// given grid. This is because each
// stencil-bundle gets shifted by the WF angles
// at each step in the WF.

// Neighbor is to the left.
// Neighbor is to the left in this dim.
if (neigh_offsets[dname] == MPIInfo::rank_prev) {
auto ext = left_wf_exts[dname];
auto ext = wf_shifts[dname];

// my halo.
// my halo on my left.
auto halo_size = gp->get_left_halo_size(dname);
halo_size += ext;
my_halo_sizes.addDimBack(dname, halo_size);

// neighbor halo.
halo_size = gp->get_right_halo_size(dname); // their right is on my left.
// neighbor halo on their right.
halo_size = gp->get_right_halo_size(dname); // assume their right == my right.
halo_size += ext;
neigh_halo_sizes.addDimBack(dname, halo_size);

// Flag that this grid has a neighbor to left or right.
found_delta = true;
}

// Neighbor is to the right.
// Neighbor is to the right in this dim.
else if (neigh_offsets[dname] == MPIInfo::rank_next) {
auto ext = right_wf_exts[dname];
auto ext = wf_shifts[dname];

// my halo.
// my halo on my right.
auto halo_size = gp->get_right_halo_size(dname);
halo_size += ext;
my_halo_sizes.addDimBack(dname, halo_size);

// neighbor halo.
halo_size = gp->get_left_halo_size(dname); // their left is on my right.
// neighbor halo on their left.
halo_size = gp->get_left_halo_size(dname); // assume their left == my left.
halo_size += ext;
neigh_halo_sizes.addDimBack(dname, halo_size);

// Flag that this grid has a neighbor to left or right.
found_delta = true;
}

// Neighbor in-line.
// Neighbor in-line in this dim.
else {
my_halo_sizes.addDimBack(dname, 0);
neigh_halo_sizes.addDimBack(dname, 0);
}

// Vectorized exchange allowed based on domain sizes?
// Both my rank and neighbor rank must have all domain sizes
// of vector multiples.
bool vec_ok = allow_vec_exchange &&
_mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] &&
_mpiInfo->has_all_vlen_mults[neigh_idx];

// Round up halo sizes if vectorized exchanges allowed.
// TODO: add a heuristic to avoid increasing by a large factor.
if (vec_ok) {
auto vec_size = _dims->_fold_pts[dname];
my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size));
neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size));
}

// Is this neighbor before or after me in this domain direction?
if (neigh_offsets[dname] != MPIInfo::rank_self)
found_delta = true;
}
}
} // domain dims in this grid.
} // domain dims.
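A brief sketch of the vector rounding applied above (`round_up` here is the common ceiling-to-multiple idiom; the actual ROUND_UP macro is defined elsewhere in the YASK sources, so treat the names and values as assumptions): when both ranks have vector-multiple domain sizes, each halo width is padded up to a multiple of the fold length in that dim so whole vectors can be exchanged.

```cpp
// Common round-up-to-multiple idiom; values below are example numbers only.
#include <cstdio>

inline long round_up(long n, long mult) {
    return ((n + mult - 1) / mult) * mult;
}

int main() {
    long halo = 3, wf_shift = 2, fold = 4;    // example widths for one dim.
    long needed = halo + wf_shift;            // 5 elements must be exchanged.
    long padded = round_up(needed, fold);     // 8: next multiple of the fold.
    std::printf("needed %ld -> padded %ld\n", needed, padded);
    // The TODO in the diff: add a heuristic so this padding does not
    // inflate a small halo by a large factor.
    return 0;
}
```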

// Is buffer needed?
// Example: if this grid is 2D in y-z, but only neighbors are in
@@ -589,11 +605,19 @@

} // all dims in this grid.

// Unique name for buffer based on grid name, direction, and ranks.
ostringstream oss;
oss << gname;
if (bd == MPIBufs::bufSend)
oss << "_send_halo_from_" << me << "_to_" << neigh_rank;
else if (bd == MPIBufs::bufRecv)
oss << "_recv_halo_from_" << neigh_rank << "_to_" << me;
string bufname = oss.str();

// Does buffer have non-zero size?
if (buf_sizes.size() == 0 || buf_sizes.product() == 0) {
TRACE_MSG("no halo exchange needed for grid '" << gname <<
"' with rank " << neigh_rank <<
" because there is no data to exchange");
TRACE_MSG("MPI buffer '" << bufname <<
"' not needed because there is no data to exchange");
continue;
}

@@ -602,15 +626,6 @@
// Convert end to last.
IdxTuple copy_last = copy_end.subElements(1);

// Unique name for buffer based on grid name, direction, and ranks.
ostringstream oss;
oss << gname;
if (bd == MPIBufs::bufSend)
oss << "_send_halo_from_" << me << "_to_" << neigh_rank;
else if (bd == MPIBufs::bufRecv)
oss << "_recv_halo_from_" << neigh_rank << "_to_" << me;
string bufname = oss.str();

// Make MPI data entry for this grid.
auto gbp = mpiData.emplace(gname, _mpiInfo);
auto& gbi = gbp.first; // iterator from pair returned by emplace().
@@ -625,18 +640,22 @@
buf.name = bufname;
buf.has_all_vlen_mults = vlen_mults;

TRACE_MSG("configured MPI buffer object '" << buf.name <<
"' for rank at relative offsets " <<
TRACE_MSG("MPI buffer '" << buf.name <<
"' configured for rank at relative offsets " <<
neigh_offsets.subElements(1).makeDimValStr() << " with " <<
buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() <<
" element(s) at " << buf.begin_pt.makeDimValStr() <<
" ... " << buf.last_pt.makeDimValStr());
num_exchanges++;
num_exchanges[bd]++;
num_elems[bd] += buf.get_size();

} // send, recv.
} // grids.
}); // neighbors.
TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges);
TRACE_MSG("number of MPI send buffers on this rank: " << num_exchanges[int(MPIBufs::bufSend)]);
TRACE_MSG("number of elements in send buffers: " << makeNumStr(num_elems[int(MPIBufs::bufSend)]));
TRACE_MSG("number of MPI recv buffers on this rank: " << num_exchanges[int(MPIBufs::bufRecv)]);
TRACE_MSG("number of elements in recv buffers: " << makeNumStr(num_elems[int(MPIBufs::bufRecv)]));

// Base ptrs for all alloc'd data.
// These pointers will be shared by the ones in the grid