From 6d2cd4fc8ffe1dfda3dab08f9d902e943cda7f83 Mon Sep 17 00:00:00 2001
From: Chuck Yount
Date: Sat, 28 Apr 2018 13:16:22 -0700
Subject: [PATCH] Fix bug in rounding up indices & sizes for vectorized halo
 exchanges. Ver 2.06.02.

---
 src/common/common_utils.cpp |  2 +-
 src/kernel/lib/context.cpp  |  4 +-
 src/kernel/lib/settings.hpp |  4 +-
 src/kernel/lib/setup.cpp    | 95 +++++++++++++++++++++++++------------
 4 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp
index a5c5504b..fc430a4d 100644
--- a/src/common/common_utils.cpp
+++ b/src/common/common_utils.cpp
@@ -41,7 +41,7 @@ namespace yask {
     // for numbers above 9 (at least up to 99).
 
     // Format: "major.minor.patch".
-    const string version = "2.06.01";
+    const string version = "2.06.02";
 
     string yask_get_version_string() {
         return version;
diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp
index 7c521bf0..b70596e0 100644
--- a/src/kernel/lib/context.cpp
+++ b/src/kernel/lib/context.cpp
@@ -1281,7 +1281,7 @@ namespace yask {
                     // Vec ok?
                     // Domain sizes must be ok, and buffer size must be ok
                     // as calculated when buffers were created.
-                    bool send_vec_ok = vec_ok && sendBuf.has_all_vlen_mults;
+                    bool send_vec_ok = vec_ok && sendBuf.vec_copy_ok;
 
                     // Get first and last ranges.
                     IdxTuple first = sendBuf.begin_pt;
@@ -1330,7 +1330,7 @@
                     MPI_Wait(&grid_recv_reqs[ni], MPI_STATUS_IGNORE);
 
                     // Vec ok?
-                    bool recv_vec_ok = vec_ok && recvBuf.has_all_vlen_mults;
+                    bool recv_vec_ok = vec_ok && recvBuf.vec_copy_ok;
 
                     // Get first and last ranges.
                     IdxTuple first = recvBuf.begin_pt;
diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp
index 5545de7e..a2532cfd 100644
--- a/src/kernel/lib/settings.hpp
+++ b/src/kernel/lib/settings.hpp
@@ -633,8 +633,8 @@ namespace yask {
         IdxTuple num_pts;
 
         // Whether the number of points is a multiple of the
-        // vector length in all dims.
-        bool has_all_vlen_mults = false;
+        // vector length in all dims and buffer is aligned.
+        bool vec_copy_ok = false;
 
         // Number of points overall.
         idx_t get_size() const {
diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp
index d4aacc8f..02797647 100644
--- a/src/kernel/lib/setup.cpp
+++ b/src/kernel/lib/setup.cpp
@@ -371,7 +371,7 @@
         }
 
         // Is vectorized exchange allowed based on domain sizes?
-        // Both my rank and neighbor rank must have all domain sizes
+        // Both my rank and neighbor rank must have *all* domain sizes
        // of vector multiples.
         bool vec_ok = allow_vec_exchange &&
             _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] &&
@@ -390,6 +390,7 @@
             if (!gp || gp->is_scratch() || gp->is_fixed_size())
                 continue;
             auto& gname = gp->get_name();
+            bool grid_vec_ok = vec_ok;
 
             // Lookup first & last domain indices and calc exchange sizes
             // for this grid.
@@ -402,6 +403,9 @@
 
                 // Only consider domain dims that are used in this grid.
                 if (gp->is_dim_used(dname)) {
+                    auto vlen = _dims->_fold_pts[dname];
+                    auto lhalo = gp->get_left_halo_size(dname);
+                    auto rhalo = gp->get_right_halo_size(dname);
 
                     // Get domain indices for this grid.  If there
                     // are no more ranks in the given direction,
@@ -413,12 +417,25 @@
                     first_inner_idx.addDimBack(dname, fidx);
                     last_inner_idx.addDimBack(dname, lidx);
                     if (_opts->is_first_rank(dname))
-                        fidx -= gp->get_left_halo_size(dname);
+                        fidx -= lhalo;
                     if (_opts->is_last_rank(dname))
-                        lidx += gp->get_right_halo_size(dname);
+                        lidx += rhalo;
                     first_outer_idx.addDimBack(dname, fidx);
                     last_outer_idx.addDimBack(dname, lidx);
 
+                    // Determine if it is possible to round the
+                    // outer indices to vec-multiples. This will
+                    // be required to allow full vec exchanges for
+                    // this grid. We won't do the actual rounding
+                    // yet, because we need to see if it's safe
+                    // in all dims.
+                    fidx = round_down_flr(fidx, vlen);
+                    lidx = round_up_flr(lidx, vlen);
+                    if (fidx < gp->get_first_rank_alloc_index(dname))
+                        grid_vec_ok = false;
+                    if (lidx > gp->get_last_rank_alloc_index(dname))
+                        grid_vec_ok = false;
+
                     // Determine size of exchange in this dim. This
                     // will be the actual halo size plus any
                     // wave-front shifts. In the current
@@ -432,15 +449,12 @@
                     if (neigh_offsets[dname] == MPIInfo::rank_prev) {
                         auto ext = wf_shifts[dname];
 
-                        // my halo on my left.
-                        auto halo_size = gp->get_left_halo_size(dname);
-                        halo_size += ext;
-                        my_halo_sizes.addDimBack(dname, halo_size);
+                        // My halo on my left.
+                        my_halo_sizes.addDimBack(dname, lhalo + ext);
 
-                        // neighbor halo on their right.
-                        halo_size = gp->get_right_halo_size(dname); // assume their right == my right.
-                        halo_size += ext;
-                        neigh_halo_sizes.addDimBack(dname, halo_size);
+                        // Neighbor halo on their right.
+                        // Assume my right is same as their right.
+                        neigh_halo_sizes.addDimBack(dname, rhalo + ext);
 
                         // Flag that this grid has a neighbor to left or right.
                         found_delta = true;
@@ -450,15 +464,12 @@
                     else if (neigh_offsets[dname] == MPIInfo::rank_next) {
                         auto ext = wf_shifts[dname];
 
-                        // my halo on my right.
-                        auto halo_size = gp->get_right_halo_size(dname);
-                        halo_size += ext;
-                        my_halo_sizes.addDimBack(dname, halo_size);
+                        // My halo on my right.
+                        my_halo_sizes.addDimBack(dname, rhalo + ext);
 
-                        // neighbor halo on their left.
-                        halo_size = gp->get_left_halo_size(dname); // assume their left == my left.
-                        halo_size += ext;
-                        neigh_halo_sizes.addDimBack(dname, halo_size);
+                        // Neighbor halo on their left.
+                        // Assume my left is same as their left.
+                        neigh_halo_sizes.addDimBack(dname, lhalo + ext);
 
                         // Flag that this grid has a neighbor to left or right.
                         found_delta = true;
@@ -470,13 +481,6 @@
                         neigh_halo_sizes.addDimBack(dname, 0);
                     }
 
-                    // Round up halo sizes if vectorized exchanges allowed.
-                    // TODO: add a heuristic to avoid increasing by a large factor.
-                    if (vec_ok) {
-                        auto vec_size = _dims->_fold_pts[dname];
-                        my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vec_size));
-                        neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vec_size));
-                    }
                 } // domain dims in this grid.
             } // domain dims.
 
@@ -491,6 +495,31 @@
                 continue; // to next grid.
             }
 
+            // Round halo sizes if vectorized exchanges allowed.
+            // Both self and neighbor must be vec-multiples
+            // and outer indices must be vec-mults or extendable
+            // to be so.
+            // TODO: add a heuristic to avoid increasing by a large factor.
+            if (grid_vec_ok) {
+                for (auto& dim : _dims->_domain_dims.getDims()) {
+                    auto& dname = dim.getName();
+                    if (gp->is_dim_used(dname)) {
+                        auto vlen = _dims->_fold_pts[dname];
+
+                        // first index rounded down.
+                        first_outer_idx.setVal(dname, round_down_flr(first_outer_idx[dname], vlen));
+
+                        // last index rounded up.
+                        last_outer_idx.setVal(dname, round_up_flr(last_outer_idx[dname], vlen));
+
+                        // sizes rounded up.
+                        my_halo_sizes.setVal(dname, ROUND_UP(my_halo_sizes[dname], vlen));
+                        neigh_halo_sizes.setVal(dname, ROUND_UP(neigh_halo_sizes[dname], vlen));
+
+                    } // domain dims in this grid.
+                } // domain dims.
+            }
+
             // Make a buffer in both directions (send & receive).
             for (int bd = 0; bd < MPIBufs::nBufDirs; bd++) {
 
@@ -498,7 +527,7 @@
                 // of main grid to read from or write to based on
                 // the current neighbor being processed.
                 IdxTuple copy_begin = gp->get_allocs();
-                IdxTuple copy_end = gp->get_allocs();
+                IdxTuple copy_end = gp->get_allocs(); // one past last!
 
                 // Adjust along domain dims in this grid.
                 for (auto& dim : _dims->_domain_dims.getDims()) {
@@ -516,13 +545,15 @@
 
                         // Region to read from, i.e., data from inside
                         // this rank's domain to be put into neighbor's
-                        // halo.
+                        // halo. So, use neighbor's halo sizes when
+                        // calculating buffer size.
                         if (bd == MPIBufs::bufSend) {
 
                             // Neighbor is to the left.
                             if (neigh_ofs == idx_t(MPIInfo::rank_prev)) {
 
                                 // Only read slice as wide as halo from beginning.
+                                copy_begin[dname] = first_inner_idx[dname];
                                 copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname];
                             }
 
@@ -531,6 +562,7 @@
 
                                 // Only read slice as wide as halo before end.
                                 copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname];
+                                copy_end[dname] = last_inner_idx[dname] + 1;
                             }
 
                             // Else, this neighbor is in same posn as I am in this dim,
@@ -538,6 +570,7 @@
                         }
 
                         // Region to write to, i.e., into this rank's halo.
+                        // So, use my halo sizes when calculating buffer sizes.
                         else if (bd == MPIBufs::bufRecv) {
 
                             // Neighbor is to the left.
@@ -573,10 +606,12 @@
                     if (_dims->_domain_dims.lookup(dname)) {
                         dsize = copy_end[dname] - copy_begin[dname];
 
-                        // Check whether size is multiple of vlen.
+                        // Check whether alignment and size are multiple of vlen.
                         auto vlen = _dims->_fold_pts[dname];
                         if (dsize % vlen != 0)
                             vlen_mults = false;
+                        if (imod_flr(copy_begin[dname], vlen) != 0)
+                            vlen_mults = false;
                     }
 
                     // step dim?
@@ -638,7 +673,7 @@
                 buf.last_pt = copy_last;
                 buf.num_pts = buf_sizes;
                 buf.name = bufname;
-                buf.has_all_vlen_mults = vlen_mults;
+                buf.vec_copy_ok = vlen_mults;
 
                 TRACE_MSG("MPI buffer '" << buf.name <<
                           "' configured for rank at relative offsets " <<
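
Note on the rounding helpers used by this patch: round_down_flr(), round_up_flr(), and imod_flr() must use floored-division semantics, because outer halo indices can be negative (the left outer halo of the first rank starts below zero) and plain C++ integer division truncates toward zero, which would "round" a negative index the wrong way. Below is a minimal, self-contained sketch of those semantics and of the new grid_vec_ok check; the function bodies and the sample bounds are illustrative assumptions, not copies of YASK's actual utility code.

    // Sketch only: illustrates floored-division rounding with the same
    // names this patch uses; not the real YASK implementations.
    #include <cassert>
    #include <cstdint>
    #include <iostream>

    using idx_t = std::int64_t;

    // Floored modulo: result is always in [0, m), even for negative n.
    // Plain C++ '%' truncates toward zero, so -3 % 4 == -3, not 1.
    idx_t imod_flr(idx_t n, idx_t m) {
        idx_t r = n % m;
        return (r < 0) ? r + m : r;
    }

    // Round n down to a multiple of m, toward -infinity.
    idx_t round_down_flr(idx_t n, idx_t m) {
        return n - imod_flr(n, m);
    }

    // Round n up to a multiple of m, toward +infinity.
    idx_t round_up_flr(idx_t n, idx_t m) {
        return round_down_flr(n + m - 1, m);
    }

    int main() {
        const idx_t vlen = 4; // vector-fold length in one dim.

        // On the first rank, the left outer halo starts below zero.
        // Truncating division would pull -3 toward 0; floored rounding
        // correctly extends it down to -4, a whole vector multiple.
        assert(round_down_flr(-3, vlen) == -4);
        assert(round_up_flr(13, vlen) == 16);
        assert(imod_flr(-3, vlen) == 1);

        // The patch's per-grid safety check in miniature: the rounded
        // outer indices must stay inside the allocated range, or
        // vectorized copies are disabled for the whole grid.
        idx_t first_outer = -3, last_outer = 13;  // hypothetical indices.
        idx_t first_alloc = -4, last_alloc = 16;  // hypothetical alloc bounds.
        bool grid_vec_ok =
            round_down_flr(first_outer, vlen) >= first_alloc &&
            round_up_flr(last_outer, vlen) <= last_alloc;
        std::cout << "vec exchange ok? " << grid_vec_ok << std::endl;
        return 0;
    }

This also illustrates why the patch defers the actual rounding of first_outer_idx and last_outer_idx until after every dim has been checked: per its own comment, the rounding is only applied once it is known to be safe in all dims, since widening the indices in only some dims would leave the buffer geometry inconsistent.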