From eb8fcf2d364edaceb3c13f57087e3f106780d182 Mon Sep 17 00:00:00 2001 From: Chuck Yount Date: Sat, 28 Apr 2018 11:09:58 -0700 Subject: [PATCH] Mark grids dirty in a rank if *any* rank could make it dirty. Closes #106. --- src/kernel/lib/context.cpp | 41 ++++++++---- src/kernel/lib/setup.cpp | 127 +++++++++++++++++++++---------------- 2 files changed, 101 insertions(+), 67 deletions(-) diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 668696d1..7c521bf0 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -271,9 +271,13 @@ namespace yask { #include "yask_misc_loops.hpp" #undef misc_fn - // Remember grids that have been written to by this bundle, + // Mark grids that [may] have been written to by this bundle, // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *asg); + // Mark grids as dirty even if not actually written by this + // rank. This is needed because neighbors will not know what + // grids are actually dirty, and all ranks must have the same + // information about which grids are possibly dirty. + mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } // needed bundles. } // all bundles. @@ -436,9 +440,9 @@ namespace yask { } // If doing wave-fronts, must loop through all bundles in - // calc_region(). - // TODO: make this the only case, allowing all bundles to be done - // between MPI exchanges, even w/o wave-fronts. + // calc_region(). TODO: consider making this the only case, + // allowing all bundles to be done between MPI exchanges, even + // w/o wave-fronts. else { // Exchange all dirty halo(s). @@ -606,13 +610,18 @@ namespace yask { // similar for y and z. This code typically // contains the outer OpenMP loop(s). #include "yask_region_loops.hpp" - - // Remember grids that have been written to by this bundle, - // updated at next step (+/- 1). - mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); } - // Shift spatial region boundaries for next iteration to + // Mark grids that [may] have been written to by this bundle, + // updated at next step (+/- 1). + // Mark grids as dirty even if not actually written by this + // rank. This is needed because neighbors will not know what + // grids are actually dirty, and all ranks must have the same + // information about which grids are possibly dirty. + // TODO: make this smarter. + mark_grids_dirty(start_t + step_t, stop_t + step_t, *sg); + + // Shift spatial region boundaries for next iteration to // implement temporal wavefront. Between regions, we only shift // backward, so region loops must strictly increment. They may do // so in any order. TODO: shift only what is needed by @@ -1184,7 +1193,7 @@ namespace yask { if (!gp->is_dirty(t)) continue; - // Only need to swap grids that have MPI buffers. + // Only need to swap grids that have any MPI buffers. auto& gname = gp->get_name(); if (mpiData.count(gname) == 0) continue; @@ -1226,7 +1235,7 @@ namespace yask { auto gp = gtsi.second; gi++; MPI_Request* grid_recv_reqs = recv_reqs[gi]; - TRACE_MSG(" for grid '" << gname << "'..."); + TRACE_MSG(" for grid #" << gi << ", '" << gname << "'..."); // Visit all this rank's neighbors. auto& grid_mpi_data = mpiData.at(gname); @@ -1260,6 +1269,8 @@ namespace yask { neighbor_rank, int(gi), _env->comm, &grid_recv_reqs[ni]); num_recv_reqs++; } + else + TRACE_MSG(" 0B to request"); } // Pack data into send buffer, then send to neighbor. 
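Note on the context.cpp changes above: the halo-exchange code posts receives and sends for a grid only when its local is_dirty() flag is set, and the matching operation on the neighbor is guarded by the same test on the neighbor's side, so every rank must reach the same dirty/not-dirty decision for every grid and step. The short, self-contained C++ sketch below (hypothetical names, not the YASK API) illustrates how the old rule lets the flags diverge between ranks, which is exactly what this patch removes.

    // Minimal standalone sketch (hypothetical types and names, not the YASK API)
    // of why the dirty flag must be computed identically on every rank: each side
    // of a halo exchange decides on its own whether to post a send or receive,
    // so the decisions must match.
    #include <iostream>

    struct RankView {
        bool bundle_ran;   // the bundle that can write this grid was executed
        bool wrote_data;   // this rank's sub-domain was actually written
    };

    // Old rule: a grid is dirty only if this rank actually wrote it.
    bool dirty_old(const RankView& r) { return r.wrote_data; }

    // New rule (this patch): a grid is dirty if *any* rank could have written it,
    // i.e., whenever the writing bundle ran, regardless of local writes.
    bool dirty_new(const RankView& r) { return r.bundle_ran; }

    int main() {
        RankView left  {true, true};    // left rank wrote its part of the grid
        RankView right {true, false};   // right rank happened to write nothing

        // Old rule: left would send a halo that right never expects to receive.
        std::cout << "old rule, ranks agree: "
                  << std::boolalpha << (dirty_old(left) == dirty_old(right)) << "\n";

        // New rule: both ranks reach the same conclusion and the exchange matches.
        std::cout << "new rule, ranks agree: "
                  << (dirty_new(left) == dirty_new(right)) << "\n";
    }

With the old rule, a rank whose sub-domain happened not to be written would skip its side of the exchange while its neighbor did not; with the new rule the decision depends only on information every rank shares.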
@@ -1276,7 +1287,7 @@ namespace yask {
                     IdxTuple first = sendBuf.begin_pt;
                     IdxTuple last = sendBuf.last_pt;
 
-                    // The code in allocData() pre-calculated the first and
+                    // The code in allocMpiData() pre-calculated the first and
                     // last points of each buffer, except in the step dim.
                     // So, we need to set that value now.
                     // TODO: update this if we expand the buffers to hold
@@ -1305,6 +1316,8 @@ namespace yask {
                                   neighbor_rank, int(gi), _env->comm,
                                   &send_reqs[num_send_reqs++]);
                     }
+                    else
+                        TRACE_MSG(" 0B to send");
                 }
 
                 // Wait for data from neighbor, then unpack it.
@@ -1343,6 +1356,8 @@ namespace yask {
                         n = gp->set_elements_in_slice(buf, first, last);
                         assert(n == recvBuf.get_size());
                     }
+                    else
+                        TRACE_MSG(" 0B to wait for");
                 }
 
             }); // visit neighbors.
diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp
index 58fb63ef..d4aacc8f 100644
--- a/src/kernel/lib/setup.cpp
+++ b/src/kernel/lib/setup.cpp
@@ -328,7 +328,7 @@ namespace yask {
         } // grid passes.
     };
 
-    // Create MPI and allocate buffers.
+    // Create MPI buffers and allocate them.
    void StencilContext::allocMpiData(ostream& os) {
 
        // Remove any old MPI data.
@@ -336,7 +336,8 @@ namespace yask {
 
 #ifdef USE_MPI
 
-        int num_exchanges = 0;
+        map<int, int> num_exchanges; // send/recv => count.
+        map<int, idx_t> num_elems; // send/recv => count.
         auto me = _env->my_rank;
 
         // Need to determine the size and shape of all MPI buffers.
@@ -369,10 +370,24 @@ namespace yask {
                 return; // from lambda fn.
             }
 
-            // Determine size of MPI buffers between neigh_rank and my rank
-            // for each grid and create those that are needed.
+            // Is vectorized exchange allowed based on domain sizes?
+            // Both my rank and neighbor rank must have all domain sizes
+            // of vector multiples.
+            bool vec_ok = allow_vec_exchange &&
+                _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] &&
+                _mpiInfo->has_all_vlen_mults[neigh_idx];
+
+            // Determine size of MPI buffers between neigh_rank and my
+            // rank for each grid and create those that are needed. It
+            // is critical that the number, size, and shape of my
+            // send/receive buffers match those of the receive/send
+            // buffers of my neighbors. Important: Current algorithm
+            // assumes my left neighbor's buffer sizes can be calculated
+            // by considering my rank's right side data and vice-versa.
+            // Thus, all ranks must have consistent data that contribute
+            // to these calculations.
             for (auto gp : gridPtrs) {
-                if (!gp)
+                if (!gp || gp->is_scratch() || gp->is_fixed_size())
                     continue;
 
                 auto& gname = gp->get_name();
@@ -384,12 +399,15 @@ namespace yask {
                 IdxTuple first_outer_idx, last_outer_idx;
                 for (auto& dim : _dims->_domain_dims.getDims()) {
                     auto& dname = dim.getName();
+
+                    // Only consider domain dims that are used in this grid.
                     if (gp->is_dim_used(dname)) {
 
-                        // Get domain indices for this grid.
-                        // If there are no more ranks in the given direction, extend
-                        // the index into the outer halo to make sure all data are sync'd.
-                        // This is critical for WFs.
+                        // Get domain indices for this grid. If there
+                        // are no more ranks in the given direction,
+                        // extend the "outer" index to include the halo
+                        // in that direction to make sure all data are
+                        // sync'd. This is critical for WFs.
                         idx_t fidx = gp->get_first_rank_domain_index(dname);
                         idx_t lidx = gp->get_last_rank_domain_index(dname);
                         first_inner_idx.addDimBack(dname, fidx);
@@ -401,55 +419,57 @@ namespace yask {
                         first_outer_idx.addDimBack(dname, fidx);
                         last_outer_idx.addDimBack(dname, lidx);
-                        // Determine size of exchange. This will be the actual halo size
-                        // plus any wave-front extensions. 
In the current implementation, - // we need the wave-front extensions regardless of whether there - // is a halo on a given grid. This is because each stencil-bundle - // gets shifted by the WF angles at each step in the WF. + // Determine size of exchange in this dim. This + // will be the actual halo size plus any + // wave-front shifts. In the current + // implementation, we need the wave-front shifts + // regardless of whether there is a halo on a + // given grid. This is because each + // stencil-bundle gets shifted by the WF angles + // at each step in the WF. - // Neighbor is to the left. + // Neighbor is to the left in this dim. if (neigh_offsets[dname] == MPIInfo::rank_prev) { - auto ext = left_wf_exts[dname]; + auto ext = wf_shifts[dname]; - // my halo. + // my halo on my left. auto halo_size = gp->get_left_halo_size(dname); halo_size += ext; my_halo_sizes.addDimBack(dname, halo_size); - // neighbor halo. - halo_size = gp->get_right_halo_size(dname); // their right is on my left. + // neighbor halo on their right. + halo_size = gp->get_right_halo_size(dname); // assume their right == my right. halo_size += ext; neigh_halo_sizes.addDimBack(dname, halo_size); + + // Flag that this grid has a neighbor to left or right. + found_delta = true; } - // Neighbor is to the right. + // Neighbor is to the right in this dim. else if (neigh_offsets[dname] == MPIInfo::rank_next) { - auto ext = right_wf_exts[dname]; + auto ext = wf_shifts[dname]; - // my halo. + // my halo on my right. auto halo_size = gp->get_right_halo_size(dname); halo_size += ext; my_halo_sizes.addDimBack(dname, halo_size); - // neighbor halo. - halo_size = gp->get_left_halo_size(dname); // their left is on my right. + // neighbor halo on their left. + halo_size = gp->get_left_halo_size(dname); // assume their left == my left. halo_size += ext; neigh_halo_sizes.addDimBack(dname, halo_size); + + // Flag that this grid has a neighbor to left or right. + found_delta = true; } - // Neighbor in-line. + // Neighbor in-line in this dim. else { my_halo_sizes.addDimBack(dname, 0); neigh_halo_sizes.addDimBack(dname, 0); } - // Vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have all domain sizes - // of vector multiples. - bool vec_ok = allow_vec_exchange && - _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && - _mpiInfo->has_all_vlen_mults[neigh_idx]; - // Round up halo sizes if vectorized exchanges allowed. // TODO: add a heuristic to avoid increasing by a large factor. if (vec_ok) { @@ -457,12 +477,8 @@ namespace yask { my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size)); neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size)); } - - // Is this neighbor before or after me in this domain direction? - if (neigh_offsets[dname] != MPIInfo::rank_self) - found_delta = true; - } - } + } // domain dims in this grid. + } // domain dims. // Is buffer needed? // Example: if this grid is 2D in y-z, but only neighbors are in @@ -589,11 +605,19 @@ namespace yask { } // all dims in this grid. + // Unique name for buffer based on grid name, direction, and ranks. + ostringstream oss; + oss << gname; + if (bd == MPIBufs::bufSend) + oss << "_send_halo_from_" << me << "_to_" << neigh_rank; + else if (bd == MPIBufs::bufRecv) + oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; + string bufname = oss.str(); + // Does buffer have non-zero size? 
if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { - TRACE_MSG("no halo exchange needed for grid '" << gname << - "' with rank " << neigh_rank << - " because there is no data to exchange"); + TRACE_MSG("MPI buffer '" << bufname << + "' not needed because there is no data to exchange"); continue; } @@ -602,15 +626,6 @@ namespace yask { // Convert end to last. IdxTuple copy_last = copy_end.subElements(1); - // Unique name for buffer based on grid name, direction, and ranks. - ostringstream oss; - oss << gname; - if (bd == MPIBufs::bufSend) - oss << "_send_halo_from_" << me << "_to_" << neigh_rank; - else if (bd == MPIBufs::bufRecv) - oss << "_recv_halo_from_" << neigh_rank << "_to_" << me; - string bufname = oss.str(); - // Make MPI data entry for this grid. auto gbp = mpiData.emplace(gname, _mpiInfo); auto& gbi = gbp.first; // iterator from pair returned by emplace(). @@ -625,18 +640,22 @@ namespace yask { buf.name = bufname; buf.has_all_vlen_mults = vlen_mults; - TRACE_MSG("configured MPI buffer object '" << buf.name << - "' for rank at relative offsets " << + TRACE_MSG("MPI buffer '" << buf.name << + "' configured for rank at relative offsets " << neigh_offsets.subElements(1).makeDimValStr() << " with " << buf.num_pts.makeDimValStr(" * ") << " = " << buf.get_size() << " element(s) at " << buf.begin_pt.makeDimValStr() << " ... " << buf.last_pt.makeDimValStr()); - num_exchanges++; + num_exchanges[bd]++; + num_elems[bd] += buf.get_size(); } // send, recv. } // grids. }); // neighbors. - TRACE_MSG("number of halo-exchanges needed on this rank: " << num_exchanges); + TRACE_MSG("number of MPI send buffers on this rank: " << num_exchanges[int(MPIBufs::bufSend)]); + TRACE_MSG("number of elements in send buffers: " << makeNumStr(num_elems[int(MPIBufs::bufSend)])); + TRACE_MSG("number of MPI recv buffers on this rank: " << num_exchanges[int(MPIBufs::bufRecv)]); + TRACE_MSG("number of elements in recv buffers: " << makeNumStr(num_elems[int(MPIBufs::bufRecv)])); // Base ptrs for all alloc'd data. // These pointers will be shared by the ones in the grid
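Note on the buffer-sizing logic in setup.cpp above: for each domain dimension with a neighbor, the exchange width is the grid's halo in that direction plus the wave-front shift, rounded up to a multiple of the vector length when both ranks allow vectorized exchanges. A rough standalone sketch of that arithmetic (local helper names, not YASK code) follows; because both ranks evaluate it from the same inputs, the send buffer on one side and the receive buffer on the other come out the same size.

    // Rough standalone sketch (local helpers, not YASK code) of the per-dimension
    // sizing described above: halo plus wave-front shift, rounded up to a whole
    // number of vectors when vectorized exchange is permitted on both ranks.
    #include <cstdint>
    #include <iostream>

    using idx_t = std::int64_t;

    idx_t round_up(idx_t n, idx_t mult) {
        return ((n + mult - 1) / mult) * mult;
    }

    // Width of the halo exchange in one domain dimension.
    idx_t exchange_width(idx_t halo, idx_t wf_shift, bool vec_ok, idx_t vlen) {
        idx_t w = halo + wf_shift;      // halo plus any wave-front shift
        if (vec_ok)
            w = round_up(w, vlen);      // whole vectors only, if allowed
        return w;
    }

    int main() {
        // Example: halo of 4 points, wave-front shift of 2, vector length of 8.
        std::cout << "scalar exchange width: " << exchange_width(4, 2, false, 8) << "\n"; // 6
        std::cout << "vector exchange width: " << exchange_width(4, 2, true, 8)  << "\n"; // 8
    }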
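Note on the buffer names introduced above: each buffer gets a unique name built from the grid name, the transfer direction, and the two ranks involved, which keeps the TRACE_MSG output unambiguous when many buffers exist. The small illustration below copies the format from the patch but uses a local enum and helper as stand-ins for the YASK classes.

    // Standalone illustration of the buffer-naming convention added above; the
    // format itself is taken from the patch, but the enum and helper are local
    // stand-ins rather than YASK classes.
    #include <iostream>
    #include <sstream>
    #include <string>

    enum class BufDir { Send, Recv };

    std::string make_buf_name(const std::string& gname, BufDir bd,
                              int me, int neigh_rank) {
        std::ostringstream oss;
        oss << gname;
        if (bd == BufDir::Send)
            oss << "_send_halo_from_" << me << "_to_" << neigh_rank;
        else
            oss << "_recv_halo_from_" << neigh_rank << "_to_" << me;
        return oss.str();
    }

    int main() {
        // Rank 0 exchanging the halo of a grid named "vel" with rank 1:
        std::cout << make_buf_name("vel", BufDir::Send, 0, 1) << "\n"; // vel_send_halo_from_0_to_1
        std::cout << make_buf_name("vel", BufDir::Recv, 0, 1) << "\n"; // vel_recv_halo_from_1_to_0
    }

A send buffer named X_send_halo_from_A_to_B on rank A pairs with the buffer named X_recv_halo_from_A_to_B on rank B.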