From aeba7d72c809e06eae446c5da53703736b694ab5 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 29 Jan 2024 12:10:09 -0800 Subject: [PATCH 01/21] Prolongator now multi-RHS aware --- include/kernels/prolongator.cuh | 62 ++++++++++++++++++++---------- include/transfer.h | 8 ++-- lib/prolongator.in.cpp | 14 +++---- lib/prolongator.in.cu | 61 ++++++++++++++++------------- lib/staggered_prolong_restrict.cu | 8 ++++ lib/transfer.cpp | 64 ++++++++++++++++++++----------- 6 files changed, 137 insertions(+), 80 deletions(-) diff --git a/include/kernels/prolongator.cuh b/include/kernels/prolongator.cuh index 1075b5e8d5..d4296d41b1 100644 --- a/include/kernels/prolongator.cuh +++ b/include/kernels/prolongator.cuh @@ -22,37 +22,55 @@ namespace quda { static constexpr int fineColor = fineColor_; static constexpr int coarseColor = coarseColor_; - FieldOrderCB(fineSpin)> out;; - const FieldOrderCB(coarseSpin)> in; - const FieldOrderCB(fineSpin), vFloat> V; + // disable ghost to reduce arg size + using F = FieldOrderCB(fineSpin), Float, Float, true>; + using C = FieldOrderCB(coarseSpin), Float, Float, true>; + using V = FieldOrderCB(fineSpin), vFloat, vFloat>; + + static constexpr unsigned int max_n_src = 64; + const int_fastdiv n_src; + F out[max_n_src]; + C in[max_n_src]; + const V v; const int *geo_map; // need to make a device copy of this const spin_mapper spin_map; const int parity; // the parity of the output field (if single parity) const int nParity; // number of parities of input fine field - ProlongateArg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V, + ProlongateArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *geo_map, const int parity) : - kernel_param(dim3(out.VolumeCB(), out.SiteSubset(), fineColor/fine_colors_per_thread())), - out(out), in(in), V(V), geo_map(geo_map), spin_map(), parity(parity), nParity(out.SiteSubset()) - { } + kernel_param(dim3(out[0].VolumeCB(), out[0].SiteSubset() * out.size(), fineColor/fine_colors_per_thread())), + n_src(out.size()), + v(v), + geo_map(geo_map), + spin_map(), + parity(parity), + nParity(out[0].SiteSubset()) + { + if (out.size() > max_n_src) errorQuda("vector set size %lu greater than max size %d", out.size(), max_n_src); + for (auto i = 0u; i < out.size(); i++) { + this->out[i] = out[i]; + this->in[i] = in[i]; + } + } }; /** Applies the grid prolongation operator (coarse to fine) */ template - __device__ __host__ inline void prolongate(complex out[], const Arg &arg, int parity, int x_cb) + __device__ __host__ inline void prolongate(complex out[], const Arg &arg, int src_idx, int parity, int x_cb) { - int x = parity*arg.out.VolumeCB() + x_cb; + int x = parity * arg.out[src_idx].VolumeCB() + x_cb; int x_coarse = arg.geo_map[x]; - int parity_coarse = (x_coarse >= arg.in.VolumeCB()) ? 1 : 0; - int x_coarse_cb = x_coarse - parity_coarse*arg.in.VolumeCB(); + int parity_coarse = (x_coarse >= arg.in[src_idx].VolumeCB()) ? 1 : 0; + int x_coarse_cb = x_coarse - parity_coarse * arg.in[src_idx].VolumeCB(); #pragma unroll for (int s=0; s - __device__ __host__ inline void rotateFineColor(const Arg &arg, const complex in[], int parity, int x_cb, int fine_color_block) + __device__ __host__ inline void rotateFineColor(const Arg &arg, const complex in[], int src_idx, int parity, int x_cb, int fine_color_block) { constexpr int fine_color_per_thread = fine_colors_per_thread(); const int spinor_parity = (arg.nParity == 2) ? parity : 0; - const int v_parity = (arg.V.Nparity() == 2) ? 
parity : 0; + const int v_parity = (arg.v.Nparity() == 2) ? parity : 0; constexpr int color_unroll = 2; @@ -82,15 +100,15 @@ namespace quda { #pragma unroll for (int j=0; j tmp[Arg::fineSpin*Arg::coarseColor]; - prolongate(tmp, arg, parity, x_cb); - rotateFineColor(arg, tmp, parity, x_cb, fine_color_block); + + prolongate(tmp, arg, src_idx, parity, x_cb); + rotateFineColor(arg, tmp, src_idx, parity, x_cb, fine_color_block); } }; diff --git a/include/transfer.h b/include/transfer.h index 87500f0fd8..01d3e6c50f 100644 --- a/include/transfer.h +++ b/include/transfer.h @@ -178,7 +178,7 @@ namespace quda { * @param out The resulting field on the fine lattice * @param in The input field on the coarse lattice */ - void P(ColorSpinorField &out, const ColorSpinorField &in) const; + void P(cvector_ref &out, cvector_ref &in) const; /** * Apply the restrictor @@ -313,11 +313,11 @@ namespace quda { @param[in] spin_map Spin blocking lookup table @param[in] parity of the output fine field (if single parity output field) */ - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); template - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); /** @@ -345,7 +345,7 @@ namespace quda { @param[in] spin_map Spin blocking lookup table @param[in] parity of the output fine field (if single parity output field) */ - void StaggeredProlongate(ColorSpinorField &out, const ColorSpinorField &in, const int *fine_to_coarse, + void StaggeredProlongate(cvector_ref &out, cvector_ref &in, const int *fine_to_coarse, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); /** diff --git a/lib/prolongator.in.cpp b/lib/prolongator.in.cpp index 008d73de26..0f05524289 100644 --- a/lib/prolongator.in.cpp +++ b/lib/prolongator.in.cpp @@ -7,10 +7,10 @@ namespace quda }; template - void Prolongate2(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate2(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *const *spin_map, int parity, IntList) { - if (in.Ncolor() == coarseColor) { + if (in[0].Ncolor() == coarseColor) { if constexpr (coarseColor >= fineColor) { Prolongate(out, in, v, fine_to_coarse, spin_map, parity); } else { @@ -20,16 +20,16 @@ namespace quda if constexpr (sizeof...(N) > 0) { Prolongate2(out, in, v, fine_to_coarse, spin_map, parity, IntList()); } else { - errorQuda("Coarse Nc = %d has not been instantiated", in.Ncolor()); + errorQuda("Coarse Nc = %d has not been instantiated", in[0].Ncolor()); } } } template - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *const *spin_map, int parity, IntList) { - if (out.Ncolor() == fineColor) { + if (out[0].Ncolor() == fineColor) { // clang-format off IntList<@QUDA_MULTIGRID_NVEC_LIST@> coarseColors; // clang-format on @@ -38,12 +38,12 @@ namespace quda if constexpr (sizeof...(N) > 0) { Prolongate(out, in, v, fine_to_coarse, spin_map, parity, IntList()); } else { - errorQuda("Fine Nc = %d has not been instantiated", 
out.Ncolor());
+        errorQuda("Fine Nc = %d has not been instantiated", out[0].Ncolor());
       }
     }
   }

-  void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
+  void Prolongate(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
                   const int *fine_to_coarse, const int *const *spin_map, int parity)
   {
     if constexpr (is_enabled_multigrid()) {
diff --git a/lib/prolongator.in.cu b/lib/prolongator.in.cu
index 43f6b2a018..8320ca5e8f 100644
--- a/lib/prolongator.in.cu
+++ b/lib/prolongator.in.cu
@@ -9,50 +9,59 @@ namespace quda {
  class ProlongateLaunch : public TunableKernel3D {
    using Arg = ProlongateArg;
-    ColorSpinorField &out;
-    const ColorSpinorField &in;
+    cvector_ref<ColorSpinorField> &out;
+    cvector_ref<const ColorSpinorField> &in;
    const ColorSpinorField &V;
    const int *fine_to_coarse;
    int parity;
    QudaFieldLocation location;

-    unsigned int minThreads() const { return out.VolumeCB(); } // fine parity is the block y dimension
+    unsigned int minThreads() const { return out[0].VolumeCB(); } // fine parity is the block y dimension

  public:
-    ProlongateLaunch(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V,
+    ProlongateLaunch(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &V,
                     const int *fine_to_coarse, int parity)
-      : TunableKernel3D(in, out.SiteSubset(), fineColor/fine_colors_per_thread()), out(out), in(in), V(V),
-        fine_to_coarse(fine_to_coarse), parity(parity), location(checkLocation(out, in, V))
+      : TunableKernel3D(in[0], out[0].SiteSubset() * out.size(), fineColor/fine_colors_per_thread()),
+        out(out),
+        in(in),
+        V(V),
+        fine_to_coarse(fine_to_coarse),
+        parity(parity),
+        location(checkLocation(out[0], in[0], V))
    {
      strcat(vol, ",");
-      strcat(vol, out.VolString().c_str());
+      strcat(vol, out[0].VolString().c_str());
      strcat(aux, ",");
-      strcat(aux, out.AuxString().c_str());
+      strcat(aux, out[0].AuxString().c_str());

      apply(device::get_default_stream());
    }

-    void apply(const qudaStream_t &stream) {
-      if (checkNative(out, in, V)) {
+    void apply(const qudaStream_t &stream)
+    {
+      if (checkNative(out[0], in[0], V)) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity));
      }
    }

-    long long flops() const { return 8 * fineSpin * fineColor * coarseColor * out.SiteSubset()*(long long)out.VolumeCB(); }
+    long long flops() const
+    {
+      return out.size() * 8 * fineSpin * fineColor * coarseColor * out[0].SiteSubset() * out[0].VolumeCB();
+    }

    long long bytes() const {
-      size_t v_bytes = V.Bytes() / (V.SiteSubset() == out.SiteSubset() ? 1 : 2);
-      return in.Bytes() + out.Bytes() + v_bytes + out.SiteSubset()*out.VolumeCB()*sizeof(int);
+      size_t v_bytes = V.Bytes() / (V.SiteSubset() == out[0].SiteSubset() ?
1 : 2); + return out.size() * (in[0].Bytes() + out[0].Bytes() + v_bytes + out[0].SiteSubset() * out[0].VolumeCB() * sizeof(int)); } }; template - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int * const * spin_map, int parity) { - if (in.Nspin() != 2) errorQuda("Coarse spin %d is not supported", in.Nspin()); + if (in[0].Nspin() != 2) errorQuda("Coarse spin %d is not supported", in[0].Nspin()); constexpr int coarseSpin = 2; // first check that the spin_map matches the spin_mapper @@ -69,7 +78,7 @@ namespace quda { } else { errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION); } - } else if (v.Precision() == in.Precision()) { + } else if (v.Precision() == in[0].Precision()) { ProlongateLaunch prolongator(out, in, v, fine_to_coarse, parity); } else { @@ -78,25 +87,25 @@ namespace quda { } template - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int * const * spin_map, int parity) { - if (!is_enabled_spin(out.Nspin())) errorQuda("nSpin %d has not been built", in.Nspin()); + if (!is_enabled_spin(out[0].Nspin())) errorQuda("nSpin %d has not been built", in[0].Nspin()); - if (out.Nspin() == 2) { + if (out[0].Nspin() == 2) { Prolongate(out, in, v, fine_to_coarse, spin_map, parity); } else if constexpr (fineColor == 3) { - if (out.Nspin() == 4) { + if (out[0].Nspin() == 4) { if constexpr (is_enabled_spin(4)) Prolongate(out, in, v, fine_to_coarse, spin_map, parity); - } else if (out.Nspin() == 1) { + } else if (out[0].Nspin() == 1) { if constexpr (is_enabled_spin(1)) Prolongate(out, in, v, fine_to_coarse, spin_map, parity); } else { - errorQuda("Unsupported nSpin %d", out.Nspin()); + errorQuda("Unsupported nSpin %d", out[0].Nspin()); } } else { - errorQuda("Unexpected spin %d and color %d combination", out.Nspin(), out.Ncolor()); + errorQuda("Unexpected spin %d and color %d combination", out[0].Nspin(), out[0].Ncolor()); } } @@ -104,11 +113,11 @@ namespace quda { constexpr int coarseColor = @QUDA_MULTIGRID_NVEC2@; template <> - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int * const * spin_map, int parity) { if constexpr (is_enabled_multigrid()) { - QudaPrecision precision = checkPrecision(out, in); + QudaPrecision precision = checkPrecision(out[0], in[0]); if (precision == QUDA_DOUBLE_PRECISION) { if constexpr (is_enabled_multigrid_double()) @@ -118,7 +127,7 @@ namespace quda { } else if (precision == QUDA_SINGLE_PRECISION) { Prolongate(out, in, v, fine_to_coarse, spin_map, parity); } else { - errorQuda("Unsupported precision %d", out.Precision()); + errorQuda("Unsupported precision %d", out[0].Precision()); } } else { errorQuda("Multigrid has not been built"); diff --git a/lib/staggered_prolong_restrict.cu b/lib/staggered_prolong_restrict.cu index 39aa07365c..a8ae5ad604 100644 --- a/lib/staggered_prolong_restrict.cu +++ b/lib/staggered_prolong_restrict.cu @@ -122,4 +122,12 @@ namespace quda { void StaggeredRestrict(ColorSpinorField &, const ColorSpinorField &, const int *, const int * const *, int) { errorQuda("Staggered multigrid has not been build"); } #endif + void 
StaggeredProlongate(cvector_ref &out, cvector_ref &in, + const int *fine_to_coarse, const int * const * spin_map, int parity) + { + for (auto i = 0u; i < out.size(); i++) { + StaggeredProlongateRestrict(out[i], in[i], fine_to_coarse, spin_map, parity); + } + } + } // end namespace quda diff --git a/lib/transfer.cpp b/lib/transfer.cpp index 12821c4618..dab93f2304 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -356,60 +356,80 @@ namespace quda { } // apply the prolongator - void Transfer::P(ColorSpinorField &out, const ColorSpinorField &in) const { + void Transfer::P(cvector_ref &out, cvector_ref &in) const { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - ColorSpinorField *input = const_cast(&in); - ColorSpinorField *output = &out; initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; if (transfer_type == QUDA_TRANSFER_COARSE_KD) { - StaggeredProlongate(*output, *input, fine_to_coarse, spin_map, parity); + StaggeredProlongate(out, in, fine_to_coarse, spin_map, parity); } else if (transfer_type == QUDA_TRANSFER_OPTIMIZED_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) { - if (in.SiteSubset() != QUDA_FULL_SITE_SUBSET) errorQuda("Optimized KD op only supports full-parity spinors"); - - if (output->VolumeCB() != input->VolumeCB()) errorQuda("Optimized KD transfer is only between equal volumes"); + if (in[0].SiteSubset() != QUDA_FULL_SITE_SUBSET) errorQuda("Optimized KD op only supports full-parity spinors"); + if (out[0].VolumeCB() != in[0].VolumeCB()) errorQuda("Optimized KD transfer is only between equal volumes"); // the optimized KD op acts on fine spinors - if (out.SiteSubset() == QUDA_PARITY_SITE_SUBSET) { - *output = input->Even(); + if (out[0].SiteSubset() == QUDA_PARITY_SITE_SUBSET) { + for (auto i = 0u; i < out.size(); i++) out[i] = in[i].Even(); } else { - *output = *input; + for (auto i = 0u; i < out.size(); i++) out[i] = in[i]; } } else if (transfer_type == QUDA_TRANSFER_AGGREGATE) { + std::vector input(in.size()); + std::vector output(out.size()); const ColorSpinorField *V = use_gpu ? V_d : V_h; + // the below are borked for m-rhs since we have only a single tmp available if (use_gpu) { - if (in.Location() == QUDA_CPU_FIELD_LOCATION) input = coarse_tmp_d; - if (out.Location() == QUDA_CPU_FIELD_LOCATION || out.GammaBasis() != V->GammaBasis()) - output = (out.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even(); + + // set input fields + if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { + for (auto i = 0u; i < in.size(); i++) input[i] = coarse_tmp_d->create_alias(); + } else { + for (auto i = 0u; i < in.size(); i++) input[i] = const_cast(in[i]).create_alias(); + } + + // set output fields + if (out[0].Location() == QUDA_CPU_FIELD_LOCATION || out[0].GammaBasis() != V->GammaBasis()) { + for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d->create_alias() : fine_tmp_d->Even().create_alias(); + } else { + for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); + } if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); } else { - if (out.Location() == QUDA_CUDA_FIELD_LOCATION) - output = (out.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? 
fine_tmp_h : &fine_tmp_h->Even();
+
+      // set input fields
+      for (auto i = 0u; i < in.size(); i++) input[i] = const_cast<ColorSpinorField &>(in[i]).create_alias();
+
+      // set output fields
+      if (out[0].Location() == QUDA_CUDA_FIELD_LOCATION) {
+        for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h->create_alias() : fine_tmp_h->Even().create_alias();
+      } else {
+        for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias();
+      }
+
    }

-    *input = in; // copy result to input field (aliasing handled automatically)
+    for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically) FIXME - maybe not?

-    if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out.SiteSubset() == QUDA_FULL_SITE_SUBSET)
+    if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET)
      errorQuda("Cannot prolongate to a full field since only have single parity null-space components");

-    if ((V->Nspin() != 1) && ((output->GammaBasis() != V->GammaBasis()) || (input->GammaBasis() != V->GammaBasis()))) {
+    if ((V->Nspin() != 1) && ((output[0].GammaBasis() != V->GammaBasis()) || (input[0].GammaBasis() != V->GammaBasis()))) {
      errorQuda("Cannot apply prolongator using fields in a different basis from the null space (%d,%d) != %d",
-                output->GammaBasis(), in.GammaBasis(), V->GammaBasis());
+                output[0].GammaBasis(), in[0].GammaBasis(), V->GammaBasis());
    }

-    Prolongate(*output, *input, *V, fine_to_coarse, spin_map, parity);
+    Prolongate(output, input, *V, fine_to_coarse, spin_map, parity);
+
+    for (auto i = 0u; i < out.size(); i++) out[i] = output[i]; // copy result to out field (aliasing handled automatically)

  } else {
    errorQuda("Invalid transfer type in prolongate");
  }

-  out = *output; // copy result to out field (aliasing handled automatically)
-
  getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
}

From 3fa6e95d431722eda0bc8830dfce6bfa5995367d Mon Sep 17 00:00:00 2001
From: maddyscientist 
Date: Mon, 29 Jan 2024 16:51:57 -0800
Subject: [PATCH 02/21] Create new functions ColorSpinorField::create_coarse
 and ColorSpinorField::create_fine, these will eventually replace
 ColorSpinorField::CreateCoarse and ColorSpinorField::CreateFine

---
 include/color_spinor_field.h | 28 ++++++++++++++
 lib/color_spinor_field.cpp   | 72 ++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index f6fc42d194..923678dbaa 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -819,6 +819,34 @@ namespace quda
      QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION, QudaMemoryType mem_type = QUDA_MEMORY_INVALID);

+    /**
+       @brief Create a coarse color-spinor field, using this field to set the meta data
+       @param[in] geoBlockSize Geometric block size that defines the coarse grid dimensions
+       @param[in] spinBlockSize Spin block size that defines the coarse spin dimension
+       @param[in] Nvec Number of coarse color degrees of freedom per grid point
+       @param[in] precision Optionally set the precision of the coarse field
+       @param[in] location Optionally set the location of the coarse field
+       @param[in] mem_type Optionally set the memory type used (e.g., can override with mapped memory)
+    */
+    ColorSpinorField create_coarse(const int *geoBlockSize, int spinBlockSize, int Nvec,
+                                   QudaPrecision precision = QUDA_INVALID_PRECISION,
+                                   QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION,
+                                   QudaMemoryType mem_type = QUDA_MEMORY_INVALID);
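(A minimal usage sketch of the new pair, illustrative only and not part of the patch; the names fine and geo_bs are hypothetical. For a Wilson-like fine field with nSpin = 4 and nColor = 3, blocking 4^4 sites with spinBlockSize = 2 and Nvec = 24 divides each x[d] by 4 and yields nSpin = 4 / 2 = 2 with nColor = 24, while create_fine inverts the mapping:

    int geo_bs[4] = {4, 4, 4, 4};
    ColorSpinorField coarse = fine.create_coarse(geo_bs, 2, 24); // x[d] /= 4, nSpin = 2, nColor = 24
    ColorSpinorField fine2 = coarse.create_fine(geo_bs, 2, 3);   // x[d] *= 4, nSpin = 2 * 2 = 4, nColor = 3
)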
+
+    /**
+       @brief Create a fine color-spinor field, using this field to set the meta data
+       @param[in] geoBlockSize Geometric block size that defines the fine grid dimensions
+       @param[in] spinBlockSize Spin block size that defines the fine spin dimension
+       @param[in] Nvec Number of fine color degrees of freedom per grid point
+       @param[in] precision Optionally set the precision of the fine field
+       @param[in] location Optionally set the location of the fine field
+       @param[in] mem_type Optionally set the memory type used (e.g., can override with mapped memory)
+    */
+    ColorSpinorField create_fine(const int *geoBlockSize, int spinBlockSize, int Nvec,
+                                 QudaPrecision precision = QUDA_INVALID_PRECISION,
+                                 QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION,
+                                 QudaMemoryType mem_type = QUDA_MEMORY_INVALID);
+
    /**
       @brief Backs up the ColorSpinorField
    */
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index d2a6c93d8d..e8fbfd56d5 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -923,6 +923,78 @@ namespace quda
    return new ColorSpinorField(fineParam);
  }

+  ColorSpinorField ColorSpinorField::create_coarse(const int *geoBlockSize, int spinBlockSize, int Nvec,
+                                                   QudaPrecision new_precision, QudaFieldLocation new_location,
+                                                   QudaMemoryType new_mem_type)
+  {
+    ColorSpinorParam coarseParam(*this);
+    for (int d = 0; d < nDim; d++) coarseParam.x[d] = x[d] / geoBlockSize[d];
+
+    int geoBlockVolume = 1;
+    for (int d = 0; d < nDim; d++) { geoBlockVolume *= geoBlockSize[d]; }
+
+    // Detect if the "coarse" op is the Kahler-Dirac op or something else
+    // that still acts on a fine staggered ColorSpinorField
+    if (geoBlockVolume == 1 && Nvec == nColor && nSpin == 1) {
+      coarseParam.nSpin = nSpin;
+      coarseParam.nColor = nColor;
+    } else {
+      coarseParam.nSpin = (nSpin == 1) ? 2 : (nSpin / spinBlockSize); // coarsening staggered check
+      coarseParam.nColor = Nvec;
+    }
+
+    coarseParam.siteSubset = QUDA_FULL_SITE_SUBSET; // coarse grid is always full
+    coarseParam.create = QUDA_ZERO_FIELD_CREATE;
+
+    // if new precision is not set, use this->precision
+    new_precision = (new_precision == QUDA_INVALID_PRECISION) ? Precision() : new_precision;
+
+    // if new location is not set, use this->location
+    new_location = (new_location == QUDA_INVALID_FIELD_LOCATION) ? Location() : new_location;
+
+    coarseParam.fieldOrder = (new_location == QUDA_CUDA_FIELD_LOCATION) ?
+      colorspinor::getNative(new_precision, coarseParam.nSpin) :
+      QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
+
+    coarseParam.setPrecision(new_precision);
+
+    // set where we allocate the field
+    coarseParam.mem_type = new_mem_type;
+
+    return ColorSpinorField(coarseParam);
+  }
+
+  ColorSpinorField ColorSpinorField::create_fine(const int *geoBlockSize, int spinBlockSize, int Nvec,
+                                                 QudaPrecision new_precision, QudaFieldLocation new_location,
+                                                 QudaMemoryType new_mem_type)
+  {
+    ColorSpinorParam fineParam(*this);
+    for (int d = 0; d < nDim; d++) fineParam.x[d] = x[d] * geoBlockSize[d];
+    fineParam.nSpin = nSpin * spinBlockSize;
+    fineParam.nColor = Nvec;
+    fineParam.siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME fine grid is always full
+    fineParam.create = QUDA_ZERO_FIELD_CREATE;
+
+    // if new precision is not set, use this->precision
+    new_precision = (new_precision == QUDA_INVALID_PRECISION) ? Precision() : new_precision;
+
+    // if new location is not set, use this->location
+    new_location = (new_location == QUDA_INVALID_FIELD_LOCATION) ?
Location() : new_location; + + // for GPU fields, always use native ordering to ensure coalescing + if (new_location == QUDA_CUDA_FIELD_LOCATION) { + fineParam.setPrecision(new_precision, new_precision, true); + } else { + fineParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + fineParam.setPrecision(new_precision); + } + + // set where we allocate the field + fineParam.mem_type = new_mem_type; + + return ColorSpinorField(fineParam); + } + // legacy CPU static ghost destructor void ColorSpinorField::freeGhostBuffer(void) { From 75646d3888a90c782e2a36fa83fcdbfcd53f8148 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 29 Jan 2024 16:54:58 -0800 Subject: [PATCH 03/21] Upgrade Transfer temporary allocation to support mrhs --- include/transfer.h | 32 ++++++++++++----------- lib/transfer.cpp | 64 +++++++++++++++++----------------------------- 2 files changed, 41 insertions(+), 55 deletions(-) diff --git a/include/transfer.h b/include/transfer.h index 01d3e6c50f..d8f9485d18 100644 --- a/include/transfer.h +++ b/include/transfer.h @@ -47,47 +47,47 @@ namespace quda { const QudaPrecision null_precision; /** CPU copy of the block-normalized null-space components that define the prolongator */ - mutable ColorSpinorField *V_h; + mutable ColorSpinorField *V_h = nullptr; /** GPU copy of the block-normalized null-space components that define the prolongator */ - mutable ColorSpinorField *V_d; + mutable ColorSpinorField *V_d = nullptr; /** A CPU temporary field with fine geometry and fine color we use for changing gamma basis */ - mutable ColorSpinorField *fine_tmp_h; + mutable std::vector fine_tmp_h; /** A GPU temporary field with fine geometry and fine color we use for changing gamma basis */ - mutable ColorSpinorField *fine_tmp_d; + mutable std::vector fine_tmp_d; /** A CPU temporary field with coarse geometry and coarse color */ - mutable ColorSpinorField *coarse_tmp_h; + mutable std::vector coarse_tmp_h; /** A GPU temporary field with coarse geometry and coarse color we use for CPU input / output */ - mutable ColorSpinorField *coarse_tmp_d; + mutable std::vector coarse_tmp_d; /** The geometrical coase grid blocking */ - int *geo_bs; + int *geo_bs = nullptr; /** The mapping onto coarse sites from fine sites. This has length equal to the fine-grid volume, and is sorted into lexicographical fine-grid order, with each value corresponding to a coarse-grid offset. (CPU) */ - mutable int *fine_to_coarse_h; + mutable int *fine_to_coarse_h = nullptr; /** The mapping onto fine sites from coarse sites. This has length equal to the fine-grid volume, and is sorted into lexicographical block order, with each value corresponding to a fine-grid offset. (CPU) */ - mutable int *coarse_to_fine_h; + mutable int *coarse_to_fine_h = nullptr; /** The mapping onto coarse sites from fine sites. This has length equal to the fine-grid volume, and is sorted into lexicographical fine-grid order, with each value corresponding to a coarse-grid offset. (GPU) */ - mutable int *fine_to_coarse_d; + mutable int *fine_to_coarse_d = nullptr; /** The mapping onto fine sites from coarse sites. This has length equal to the fine-grid volume, and is sorted into lexicographical block order, with each value corresponding to a fine-grid offset. (GPU) */ - mutable int *coarse_to_fine_d; + mutable int *coarse_to_fine_d = nullptr; /** The spin blocking. Defined as zero when the fine operator is staggered. 
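        A staggered fine operator has a single spin degree of freedom, so in that
        case there is no spin block to define.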
*/ int spin_bs; @@ -105,10 +105,10 @@ namespace quda { QudaParity parity; /** Whether the GPU transfer operator has been constructed */ - mutable bool enable_gpu; + mutable bool enable_gpu = false; /** Whether the CPU transfer operator has been constructed */ - mutable bool enable_cpu; + mutable bool enable_cpu = false; /** Whether to apply the transfer operaton the GPU (requires enable_gpu=true in the constructor) */ @@ -127,8 +127,9 @@ namespace quda { /** * @brief Allocate temporaries used when applying transfer operators * @param[in] location Where to allocate the temporaries + * @param[in] n_src Number of temporaries to allocate */ - void createTmp(QudaFieldLocation location) const; + void createTmp(QudaFieldLocation location, size_t n_src) const; /** * @brief Creates the map between fine and coarse grids @@ -145,8 +146,9 @@ namespace quda { /** * @brief Lazy allocation of the transfer operator in a given location * @param[in] location Where to allocate the temporaries + * @param[in] n_src Number of temporaries to allocate */ - void initializeLazy(QudaFieldLocation location) const; + void initializeLazy(QudaFieldLocation location, size_t n_src) const; public: /** diff --git a/lib/transfer.cpp b/lib/transfer.cpp index dab93f2304..3ae1a415bd 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -26,24 +26,11 @@ namespace quda { NblockOrtho(n_block_ortho), blockOrthoTwoPass(block_ortho_two_pass), null_precision(null_precision), - V_h(nullptr), - V_d(nullptr), - fine_tmp_h(nullptr), - fine_tmp_d(nullptr), - coarse_tmp_h(nullptr), - coarse_tmp_d(nullptr), - geo_bs(nullptr), - fine_to_coarse_h(nullptr), - coarse_to_fine_h(nullptr), - fine_to_coarse_d(nullptr), - coarse_to_fine_d(nullptr), spin_bs(spin_bs), spin_map(0), nspin_fine(B[0]->Nspin()), site_subset(QUDA_FULL_SITE_SUBSET), parity(QUDA_INVALID_PARITY), - enable_gpu(false), - enable_cpu(false), use_gpu(true), transfer_type(transfer_type) { @@ -109,7 +96,7 @@ namespace quda { } createV(B[0]->Location()); // allocate V field - createTmp(QUDA_CPU_FIELD_LOCATION); // allocate temporaries + createTmp(QUDA_CPU_FIELD_LOCATION, 1); // allocate temporaries (needed for geomap creation) // allocate and compute the fine-to-coarse and coarse-to-fine site maps fine_to_coarse_h = static_cast(pool_pinned_malloc(B[0]->Volume()*sizeof(int))); @@ -173,7 +160,7 @@ namespace quda { postTrace(); } - void Transfer::createTmp(QudaFieldLocation location) const + void Transfer::createTmp(QudaFieldLocation location, size_t n_src) const { // The CPU temporaries are needed for creating geometry mappings. 
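    // With the multi-RHS rework the temporaries are cached per (location, n_src)
    // pair: the early returns below skip reallocation when the cached vectors
    // already hold n_src fields, and the resize() calls grow them otherwise.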
if ((transfer_type == QUDA_TRANSFER_COARSE_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD @@ -182,6 +169,9 @@ namespace quda { return; } + if (location == QUDA_CUDA_FIELD_LOCATION && fine_tmp_d.size() == n_src && coarse_tmp_d.size() == n_src) return; + if (location == QUDA_CPU_FIELD_LOCATION && fine_tmp_h.size() == n_src && coarse_tmp_h.size() == n_src) return; + postTrace(); ColorSpinorParam param(*B[0]); param.create = QUDA_NULL_FIELD_CREATE; @@ -191,29 +181,30 @@ namespace quda { if (param.Precision() < QUDA_SINGLE_PRECISION) param.setPrecision(QUDA_SINGLE_PRECISION); if (location == QUDA_CUDA_FIELD_LOCATION) { - if (fine_tmp_d && coarse_tmp_d) return; - fine_tmp_d = new ColorSpinorField(param); - coarse_tmp_d = fine_tmp_d->CreateCoarse(geo_bs, spin_bs, Nvec); + resize(fine_tmp_d, n_src, param); + coarse_tmp_d.push_back(fine_tmp_d[0].create_coarse(geo_bs, spin_bs, Nvec)); + resize(coarse_tmp_d, n_src, ColorSpinorParam(coarse_tmp_d[0])); } else { - fine_tmp_h = new ColorSpinorField(param); - coarse_tmp_h = fine_tmp_h->CreateCoarse(geo_bs, spin_bs, Nvec); + resize(fine_tmp_h, n_src, param); + coarse_tmp_h.push_back(fine_tmp_h[0].create_coarse(geo_bs, spin_bs, Nvec)); + resize(coarse_tmp_h, n_src, ColorSpinorParam(coarse_tmp_h[0])); } postTrace(); } - void Transfer::initializeLazy(QudaFieldLocation location) const + void Transfer::initializeLazy(QudaFieldLocation location, size_t n_src) const { if (!enable_cpu && !enable_gpu) errorQuda("Neither CPU or GPU coarse fields initialized"); // delayed allocating this temporary until we need it - if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION); + //if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION, n_src); + createTmp(location, n_src); switch (location) { case QUDA_CUDA_FIELD_LOCATION: if (enable_gpu) return; createV(location); if (transfer_type == QUDA_TRANSFER_AGGREGATE) *V_d = *V_h; - createTmp(location); fine_to_coarse_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); coarse_to_fine_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0]->Volume() * sizeof(int), qudaMemcpyHostToDevice); @@ -270,12 +261,6 @@ namespace quda { if (V_h) delete V_h; if (V_d) delete V_d; - if (fine_tmp_h) delete fine_tmp_h; - if (fine_tmp_d) delete fine_tmp_d; - - if (coarse_tmp_h) delete coarse_tmp_h; - if (coarse_tmp_d) delete coarse_tmp_d; - if (geo_bs) delete []geo_bs; } @@ -304,8 +289,8 @@ namespace quda { int x[QUDA_MAX_DIM]; - ColorSpinorField &fine(*fine_tmp_h); - ColorSpinorField &coarse(*coarse_tmp_h); + ColorSpinorField &fine(fine_tmp_h[0]); + ColorSpinorField &coarse(coarse_tmp_h[0]); // compute the coarse grid point for every site (assuming parity ordering currently) for (size_t i = 0; i < fine.Volume(); i++) { @@ -359,7 +344,7 @@ namespace quda { void Transfer::P(cvector_ref &out, cvector_ref &in) const { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, in.size()); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; if (transfer_type == QUDA_TRANSFER_COARSE_KD) { @@ -382,19 +367,18 @@ namespace quda { const ColorSpinorField *V = use_gpu ? 
V_d : V_h; - // the below are borked for m-rhs since we have only a single tmp available if (use_gpu) { // set input fields if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < in.size(); i++) input[i] = coarse_tmp_d->create_alias(); + for (auto i = 0u; i < in.size(); i++) input[i] = coarse_tmp_d[i].create_alias(); } else { for (auto i = 0u; i < in.size(); i++) input[i] = const_cast(in[i]).create_alias(); } // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION || out[0].GammaBasis() != V->GammaBasis()) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d->create_alias() : fine_tmp_d->Even().create_alias(); + for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -406,7 +390,7 @@ namespace quda { // set output fields if (out[0].Location() == QUDA_CUDA_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h->create_alias() : fine_tmp_h->Even().create_alias(); + for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -440,7 +424,7 @@ namespace quda { ColorSpinorField *input = &const_cast(in); ColorSpinorField *output = &out; - initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, 1); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; const int *coarse_to_fine = use_gpu ? coarse_to_fine_d : coarse_to_fine_h; @@ -464,13 +448,13 @@ namespace quda { const ColorSpinorField *V = use_gpu ? V_d : V_h; if (use_gpu) { - if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = coarse_tmp_d; + if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = &coarse_tmp_d[0]; if (in.Location() == QUDA_CPU_FIELD_LOCATION || in.GammaBasis() != V->GammaBasis()) - input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even(); + input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? &fine_tmp_d[0] : &fine_tmp_d[0].Even(); if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); } else { if (in.Location() == QUDA_CUDA_FIELD_LOCATION) - input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h : &fine_tmp_h->Even(); + input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? 
&fine_tmp_h[0] : &fine_tmp_h[0].Even(); } *input = in; From 7f7fbdd381c4795adfa4e7acc68f3255fffb322c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 29 Jan 2024 19:42:17 -0800 Subject: [PATCH 04/21] Restrictor now multi-RHS aware --- include/kernels/restrictor.cuh | 61 +++++++++++++++++-------- include/transfer.h | 12 ++--- include/tunable_block_reduction.h | 2 +- lib/restrictor.in.cpp | 20 ++++---- lib/restrictor.in.cu | 65 +++++++++++++------------- lib/staggered_prolong_restrict.cu | 8 ++++ lib/transfer.cpp | 76 ++++++++++++++++++++----------- 7 files changed, 149 insertions(+), 95 deletions(-) diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh index c121f86dca..9da7633071 100644 --- a/include/kernels/restrictor.cuh +++ b/include/kernels/restrictor.cuh @@ -29,9 +29,17 @@ namespace quda { static constexpr int coarseSpin = coarseSpin_; static constexpr int coarseColor = coarseColor_; - FieldOrderCB(coarseSpin)> out; - const FieldOrderCB(fineSpin)> in; - const FieldOrderCB(fineSpin), vFloat> V; + // disable ghost to reduce arg size + using F = FieldOrderCB(fineSpin), Float, Float, true>; + using C = FieldOrderCB(coarseSpin), Float, Float, true>; + using V = FieldOrderCB(fineSpin), vFloat>; + + static constexpr unsigned int max_n_src = 64; + const int_fastdiv n_src; + + C out[max_n_src]; + F in[max_n_src]; + const V v; const int aggregate_size; // number of sites that form a single aggregate const int_fastdiv aggregate_size_cb; // number of checkerboard sites that form a single aggregate const int *fine_to_coarse; @@ -51,26 +59,37 @@ namespace quda { dim3 grid_dim; dim3 block_dim; - RestrictArg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V, + RestrictArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *coarse_to_fine, int parity) : - kernel_param(dim3(in.Volume()/out.Volume(), 1, coarseColor/coarse_colors_per_thread())), - out(out), in(in), V(V), - aggregate_size(in.Volume()/out.Volume()), - aggregate_size_cb(in.VolumeCB()/out.Volume()), - fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine), - spin_map(), parity(parity), nParity(in.SiteSubset()), swizzle_factor(1) - { } + kernel_param(dim3(in[0].Volume() / out[0].Volume(), 1, out.size() * (coarseColor / coarse_colors_per_thread()))), + n_src(out.size()), + v(v), + aggregate_size(in[0].Volume() / out[0].Volume()), + aggregate_size_cb(in[0].VolumeCB() / out[0].Volume()), + fine_to_coarse(fine_to_coarse), + coarse_to_fine(coarse_to_fine), + spin_map(), + parity(parity), + nParity(in[0].SiteSubset()), + swizzle_factor(1) + { + if (out.size() > max_n_src) errorQuda("vector set size %lu greater than max size %d", out.size(), max_n_src); + for (auto i = 0u; i < out.size(); i++) { + this->out[i] = out[i]; + this->in[i] = in[i]; + } + } }; /** Rotates from the fine-color basis into the coarse-color basis. */ template - __device__ __host__ inline void rotateCoarseColor(Out &out, const Arg &arg, int parity, int x_cb, int coarse_color_block) + __device__ __host__ inline void rotateCoarseColor(Out &out, const Arg &arg, int src_idx, int parity, int x_cb, int coarse_color_block) { constexpr int coarse_color_per_thread = coarse_colors_per_thread(); const int spinor_parity = (arg.nParity == 2) ? parity : 0; - const int v_parity = (arg.V.Nparity() == 2) ? parity : 0; + const int v_parity = (arg.v.Nparity() == 2) ? 
parity : 0; #pragma unroll for (int s=0; s, Arg::fineSpin * coarse_color_per_thread> tmp{0}; - rotateCoarseColor(tmp, arg, parity, x_fine_cb, coarse_color_block); + rotateCoarseColor(tmp, arg, src_idx, parity, x_fine_cb, coarse_color_block); // perform any local spin coarsening #pragma unroll @@ -155,15 +176,15 @@ namespace quda { reduced = BlockReduce(thread.z).Sum(reduced); if (target::thread_idx().x == 0) { - const int parity_coarse = x_coarse >= arg.out.VolumeCB() ? 1 : 0; - const int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB(); + const int parity_coarse = x_coarse >= arg.out[src_idx].VolumeCB() ? 1 : 0; + const int x_coarse_cb = x_coarse - parity_coarse*arg.out[src_idx].VolumeCB(); #pragma unroll for (int s = 0; s < Arg::coarseSpin; s++) { #pragma unroll for (int coarse_color_local=0; coarse_color_local &out, cvector_ref &in) const; /** * @brief The precision of the packed null-space vectors @@ -332,12 +332,12 @@ namespace quda { @param[in] spin_map Spin blocking lookup table @param[in] parity of the input fine field (if single parity input field) */ - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse, - const int *coarse_to_fine, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, + const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); template - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse, - const int *coarse_to_fine, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, + const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); /** @brief Apply the unitary "prolongation" operator for Kahler-Dirac preconditioning @@ -358,7 +358,7 @@ namespace quda { @param[in] spin_map Spin blocking lookup table @param[in] parity of the output fine field (if single parity output field) */ - void StaggeredRestrict(ColorSpinorField &out, const ColorSpinorField &in, const int *fine_to_coarse, + void StaggeredRestrict(cvector_ref &out, cvector_ref &in, const int *fine_to_coarse, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); } // namespace quda diff --git a/include/tunable_block_reduction.h b/include/tunable_block_reduction.h index fcd73e5a92..bf4529dbf1 100644 --- a/include/tunable_block_reduction.h +++ b/include/tunable_block_reduction.h @@ -12,7 +12,7 @@ namespace quda @brief This derived tunable class is for block reduction kernels, and partners the BlockReduction2D kernel. Each reduction block is mapped to x grid, with each block mapping to a thread block. - Each thread block is potentially two dimensional, with the y + Each thread block is potentially two dimensional, with the z dimension can be mapped to vector of computations, similar to TunableKernel2D. 
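     (In this patch's multi-RHS restrictor the right-hand-side index is folded
     into this z dimension together with the coarse-color block.)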
Due to the exact mapping of reduction block to the thread blocks, no block-size tuning is performed in the x
diff --git a/lib/restrictor.in.cpp b/lib/restrictor.in.cpp
index ac982fde69..2f6d2697e0 100644
--- a/lib/restrictor.in.cpp
+++ b/lib/restrictor.in.cpp
@@ -7,10 +7,10 @@ namespace quda
  };

  template
-  void Restrict2(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse,
-                 const int *coarse_to_fine, const int *const *spin_map, int parity, IntList)
+  void Restrict2(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
+                 const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity, IntList)
  {
-    if (out.Ncolor() == coarseColor) {
+    if (out[0].Ncolor() == coarseColor) {
      if constexpr (coarseColor >= fineColor) {
        Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity);
      } else {
@@ -20,16 +20,16 @@ namespace quda
      if constexpr (sizeof...(N) > 0) {
        Restrict2(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity, IntList());
      } else {
-        errorQuda("Coarse Nc = %d has not been instantiated", out.Ncolor());
+        errorQuda("Coarse Nc = %d has not been instantiated", out[0].Ncolor());
      }
    }
  }

  template
-  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse,
-                const int *coarse_to_fine, const int *const *spin_map, int parity, IntList)
+  void Restrict(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
+                const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity, IntList)
  {
-    if (in.Ncolor() == fineColor) {
+    if (in[0].Ncolor() == fineColor) {
      // clang-format off
      IntList<@QUDA_MULTIGRID_NVEC_LIST@> coarseColors;
      // clang-format on
@@ -38,13 +38,13 @@ namespace quda
      if constexpr (sizeof...(N) > 0) {
        Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity, IntList());
      } else {
-        errorQuda("Fine Nc = %d has not been instantiated", in.Ncolor());
+        errorQuda("Fine Nc = %d has not been instantiated", in[0].Ncolor());
      }
    }
  }

-  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse,
-                const int *coarse_to_fine, const int *const *spin_map, int parity)
+  void Restrict(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
+                const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity)
  {
    if constexpr (is_enabled_multigrid()) {
      // clang-format off
diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu
index d1c2af1d4d..27a1bd5c5d 100644
--- a/lib/restrictor.in.cu
+++ b/lib/restrictor.in.cu
@@ -16,8 +16,8 @@ namespace quda {
  template
  class RestrictLaunch : public TunableBlock2D {
    using Arg = RestrictArg;
-    ColorSpinorField &out;
-    const ColorSpinorField &in;
+    cvector_ref<ColorSpinorField> &out;
+    cvector_ref<const ColorSpinorField> &in;
    const ColorSpinorField &v;
    const int *fine_to_coarse;
    const int *coarse_to_fine;
    int parity;

    bool tuneSharedBytes() const { return false; }
    bool tuneAuxDim() const { return true; }
-    unsigned int minThreads() const { return in.Volume(); } // fine parity is the block y dimension
+    unsigned int minThreads() const { return in[0].Volume(); } // fine parity is the block y dimension

  public:
-    RestrictLaunch(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
+    RestrictLaunch(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
                   const int *fine_to_coarse, const int *coarse_to_fine, int parity) :
-      TunableBlock2D(in, false, coarseColor / coarse_colors_per_thread(), max_z_block()),
+      TunableBlock2D(in[0], false, out.size() * (coarseColor / coarse_colors_per_thread()), max_z_block()),
      out(out), in(in), v(v), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine), parity(parity)
    {
      strcat(vol, ",");
-      strcat(vol, out.VolString().c_str());
+      strcat(vol, out[0].VolString().c_str());
      strcat(aux, ",");
-      strcat(aux, out.AuxString().c_str());
+      strcat(aux, out[0].AuxString().c_str());

      apply(device::get_default_stream());
    }

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      if (checkNative(out, in, v)) {
+      if (checkNative(out[0], in[0], v)) {
        Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
        arg.swizzle_factor = tp.aux.x;
        launch(tp, stream, arg);
@@ -75,42 +75,45 @@
     */
    unsigned int blockMapper() const
    {
-      auto aggregate_size = in.Volume() / out.Volume();
+      auto aggregate_size = in[0].Volume() / out[0].Volume();
      auto max_block = 128u;
      for (uint32_t b = blockMin(); b < max_block; b += blockStep())
        if (aggregate_size < b) return b;
      return max_block;
    }

-    void initTuneParam(TuneParam &param) const {
+    void initTuneParam(TuneParam &param) const
+    {
      TunableBlock2D::initTuneParam(param);
      param.block.x = blockMapper();
-      param.grid.x = out.Volume();
+      param.grid.x = out[0].Volume();
      param.shared_bytes = 0;
      param.aux.x = 2; // swizzle factor
    }

-    void defaultTuneParam(TuneParam &param) const {
+    void defaultTuneParam(TuneParam &param) const
+    {
      TunableBlock2D::defaultTuneParam(param);
      param.block.x = blockMapper();
-      param.grid.x = out.Volume();
+      param.grid.x = out[0].Volume();
      param.shared_bytes = 0;
      param.aux.x = 2; // swizzle factor
    }

-    long long flops() const { return 8 * fineSpin * fineColor * coarseColor * in.SiteSubset()*(long long)in.VolumeCB(); }
+    long long flops() const { return out.size() * 8 * fineSpin * fineColor * coarseColor * in[0].SiteSubset()*in[0].VolumeCB(); }

-    long long bytes() const {
-      size_t v_bytes = v.Bytes() / (v.SiteSubset() == in.SiteSubset() ? 1 : 2);
-      return in.Bytes() + out.Bytes() + v_bytes + in.SiteSubset()*in.VolumeCB()*sizeof(int);
+    long long bytes() const
+    {
+      size_t v_bytes = v.Bytes() / (v.SiteSubset() == in[0].SiteSubset() ?
1 : 2); + return out.size() * (in[0].Bytes() + out[0].Bytes() + v_bytes + in[0].SiteSubset() * in[0].VolumeCB() * sizeof(int)); } }; template - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) { - if (out.Nspin() != 2) errorQuda("Unsupported nSpin %d", out.Nspin()); + if (out[0].Nspin() != 2) errorQuda("Unsupported nSpin %d", out[0].Nspin()); constexpr int coarseSpin = 2; // first check that the spin_map matches the spin_mapper @@ -126,7 +129,7 @@ namespace quda { } else { errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION); } - } else if (v.Precision() == in.Precision()) { + } else if (v.Precision() == in[0].Precision()) { RestrictLaunch restrictor(out, in, v, fine_to_coarse, coarse_to_fine, parity); } else { @@ -135,25 +138,25 @@ namespace quda { } template - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) { - if (!is_enabled_spin(in.Nspin())) errorQuda("nSpin %d has not been built", in.Nspin()); + if (!is_enabled_spin(in[0].Nspin())) errorQuda("nSpin %d has not been built", in[0].Nspin()); - if (in.Nspin() == 2) { + if (in[0].Nspin() == 2) { Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity); } else if constexpr (fineColor == 3) { - if (in.Nspin() == 4) { + if (in[0].Nspin() == 4) { if constexpr (is_enabled_spin(4)) Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity); - } else if (in.Nspin() == 1) { + } else if (in[0].Nspin() == 1) { if constexpr (is_enabled_spin(1)) Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity); } else { - errorQuda("Unexpected nSpin = %d", in.Nspin()); + errorQuda("Unexpected nSpin = %d", in[0].Nspin()); } } else { - errorQuda("Unexpected spin %d and color %d combination", in.Nspin(), in.Ncolor()); + errorQuda("Unexpected spin %d and color %d combination", in[0].Nspin(), in[0].Ncolor()); } } @@ -161,11 +164,11 @@ namespace quda { constexpr int coarseColor = @QUDA_MULTIGRID_NVEC2@; template <> - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) { - checkLocation(out, in, v); - QudaPrecision precision = checkPrecision(out, in); + checkLocation(out[0], in[0], v); + QudaPrecision precision = checkPrecision(out[0], in[0]); if constexpr (is_enabled_multigrid()) { if (precision == QUDA_DOUBLE_PRECISION) { @@ -175,7 +178,7 @@ namespace quda { } else if (precision == QUDA_SINGLE_PRECISION) { Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity); } else { - errorQuda("Unsupported precision %d", out.Precision()); + errorQuda("Unsupported precision %d", out[0].Precision()); } } else { errorQuda("Multigrid has not been built"); diff --git a/lib/staggered_prolong_restrict.cu b/lib/staggered_prolong_restrict.cu index a8ae5ad604..0303817456 100644 --- a/lib/staggered_prolong_restrict.cu +++ b/lib/staggered_prolong_restrict.cu @@ -122,6 +122,14 @@ namespace quda { void StaggeredRestrict(ColorSpinorField &, const ColorSpinorField &, const int 
*, const int * const *, int) { errorQuda("Staggered multigrid has not been build"); } #endif + void StaggeredRestrict(cvector_ref &out, cvector_ref &in, + const int *fine_to_coarse, const int * const * spin_map, int parity) + { + for (auto i = 0u; i < out.size(); i++) { + StaggeredProlongateRestrict(out[i], in[i], fine_to_coarse, spin_map, parity); + } + } + void StaggeredProlongate(cvector_ref &out, cvector_ref &in, const int *fine_to_coarse, const int * const * spin_map, int parity) { diff --git a/lib/transfer.cpp b/lib/transfer.cpp index 3ae1a415bd..895055219c 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -362,9 +362,9 @@ namespace quda { } } else if (transfer_type == QUDA_TRANSFER_AGGREGATE) { + std::vector input(in.size()); std::vector output(out.size()); - const ColorSpinorField *V = use_gpu ? V_d : V_h; if (use_gpu) { @@ -378,7 +378,7 @@ namespace quda { // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION || out[0].GammaBasis() != V->GammaBasis()) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); + for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -390,7 +390,7 @@ namespace quda { // set output fields if (out[0].Location() == QUDA_CUDA_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); + for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -418,64 +418,86 @@ namespace quda { } // apply the restrictor - void Transfer::R(ColorSpinorField &out, const ColorSpinorField &in) const + void Transfer::R(cvector_ref &out, cvector_ref &in) const { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - ColorSpinorField *input = &const_cast(in); - ColorSpinorField *output = &out; initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, 1); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; const int *coarse_to_fine = use_gpu ? 
coarse_to_fine_d : coarse_to_fine_h;

    if (transfer_type == QUDA_TRANSFER_COARSE_KD) {
-      StaggeredRestrict(*output, *input, fine_to_coarse, spin_map, parity);
+      StaggeredRestrict(out, in, fine_to_coarse, spin_map, parity);
    } else if (transfer_type == QUDA_TRANSFER_OPTIMIZED_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) {
-      if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET) errorQuda("Optimized KD op only supports full-parity spinors");
-
-      if (output->VolumeCB() != input->VolumeCB()) errorQuda("Optimized KD transfer is only between equal volumes");
+      if (out[0].SiteSubset() != QUDA_FULL_SITE_SUBSET) errorQuda("Optimized KD op only supports full-parity spinors");
+      if (out[0].VolumeCB() != in[0].VolumeCB()) errorQuda("Optimized KD transfer is only between equal volumes");

      // the optimized KD op acts on fine spinors
-      if (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET) {
-        output->Even() = *input;
-        blas::zero(output->Odd());
+      if (in[0].SiteSubset() == QUDA_PARITY_SITE_SUBSET) {
+        for (auto i = 0u; i < out.size(); i++) out[i].Even() = in[i];
+        for (auto i = 0u; i < out.size(); i++) blas::zero(out[i].Odd());
      } else {
-        *output = *input;
+        for (auto i = 0u; i < out.size(); i++) out[i] = in[i];
      }
+
    } else if (transfer_type == QUDA_TRANSFER_AGGREGATE) {
+      std::vector<ColorSpinorField> input(in.size());
+      std::vector<ColorSpinorField> output(out.size());
      const ColorSpinorField *V = use_gpu ? V_d : V_h;

      if (use_gpu) {
-        if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = &coarse_tmp_d[0];
-        if (in.Location() == QUDA_CPU_FIELD_LOCATION || in.GammaBasis() != V->GammaBasis())
-          input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? &fine_tmp_d[0] : &fine_tmp_d[0].Even();
+
+        // set input fields
+        if (in[0].Location() == QUDA_CPU_FIELD_LOCATION || in[0].GammaBasis() != V->GammaBasis()) {
+          for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias();
+        } else {
+          for (auto i = 0u; i < in.size(); i++) input[i] = const_cast<ColorSpinorField &>(in[i]).create_alias();
+        }
+
+        // set output fields
+        if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) {
+          for (auto i = 0u; i < out.size(); i++) output[i] = coarse_tmp_d[i].create_alias();
+        } else {
+          for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias();
+        }

        if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU");
      } else {
-        if (in.Location() == QUDA_CUDA_FIELD_LOCATION)
-          input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? &fine_tmp_h[0] : &fine_tmp_h[0].Even();
+
+        // set input fields
+        if (in[0].Location() == QUDA_CUDA_FIELD_LOCATION) {
+          for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias();
+        } else {
+          for (auto i = 0u; i < in.size(); i++) input[i] = const_cast<ColorSpinorField &>(in[i]).create_alias();
+        }
+
+        // set output fields
+        for (auto i = 0u; i < out.size(); i++) output[i] = const_cast<ColorSpinorField &>(out[i]).create_alias();
+
      }

-      *input = in;
+      for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically) FIXME - maybe not?
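+      // The intent: when input[i] merely aliases in[i] the copy above collapses to
+      // a no-op, and data only moves when input[i] points at a tmp field because a
+      // location or gamma-basis change is needed; the FIXME records that this
+      // aliasing shortcut is as yet unverified for m-rhs.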
- if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && in.SiteSubset() == QUDA_FULL_SITE_SUBSET) + if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && in[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) errorQuda("Cannot restrict a full field since only have single parity null-space components"); - if (V->Nspin() != 1 && (output->GammaBasis() != V->GammaBasis() || input->GammaBasis() != V->GammaBasis())) + if (V->Nspin() != 1 && (output[0].GammaBasis() != V->GammaBasis() || input[0].GammaBasis() != V->GammaBasis())) errorQuda("Cannot apply restrictor using fields in a different basis from the null space (%d,%d) != %d", - out.GammaBasis(), input->GammaBasis(), V->GammaBasis()); + out[0].GammaBasis(), input[0].GammaBasis(), V->GammaBasis()); - Restrict(*output, *input, *V, fine_to_coarse, coarse_to_fine, spin_map, parity); + Restrict(output, input, *V, fine_to_coarse, coarse_to_fine, spin_map, parity); + + for (auto i = 0u; i < out.size(); i++) out[i] = output[i]; // copy result to out field (aliasing handled automatically) } else { errorQuda("Invalid transfer type in restrict"); } - out = *output; // copy result to out field (aliasing handled automatically) - // only need to synchronize if we're transferring from GPU to CPU - if (out.Location() == QUDA_CPU_FIELD_LOCATION && in.Location() == QUDA_CUDA_FIELD_LOCATION) + if (out[0].Location() == QUDA_CPU_FIELD_LOCATION && in[0].Location() == QUDA_CUDA_FIELD_LOCATION) qudaDeviceSynchronize(); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); From 7122947b69b88fdf77a4723b63cebabb177499bb Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 Jan 2024 09:00:57 -0800 Subject: [PATCH 05/21] Multigrid now stores its null space vectors using objects not pointers --- include/kernels/block_orthogonalize.cuh | 2 +- include/multigrid.h | 72 +++--- include/transfer.h | 12 +- lib/block_orthogonalize.in.cpp | 6 +- lib/block_orthogonalize.in.cu | 48 ++-- lib/interface_quda.cpp | 2 +- lib/multigrid.cpp | 331 +++++++++++------------- lib/transfer.cpp | 68 ++--- 8 files changed, 256 insertions(+), 285 deletions(-) diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh index d2d8c3cd6f..bec77273b7 100644 --- a/include/kernels/block_orthogonalize.cuh +++ b/include/kernels/block_orthogonalize.cuh @@ -69,7 +69,7 @@ namespace quda { nParity(meta.SiteSubset()), nBlockOrtho(n_block_ortho), fineVolumeCB(meta.VolumeCB()), - B{*B...}, + B{B...}, grid_dim(), block_dim() { diff --git a/include/multigrid.h b/include/multigrid.h index a3ec2ce491..16250e5bdc 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -112,7 +112,7 @@ namespace quda { MG *fine; /** The null space vectors */ - std::vector &B; + std::vector &B; /** Number of pre-smoothing applications to perform */ int nu_pre; @@ -172,7 +172,7 @@ namespace quda { /** This is top level instantiation done when we start creating the multigrid operator. 
*/ - MGParam(QudaMultigridParam &param, std::vector<ColorSpinorField *> &B, DiracMatrix *matResidual, + MGParam(QudaMultigridParam &param, std::vector<ColorSpinorField> &B, DiracMatrix *matResidual, DiracMatrix *matSmooth, DiracMatrix *matSmoothSloppy, int level = 0) : SolverParam(*(param.invert_param)), mg_global(param), @@ -208,7 +208,7 @@ namespace quda { omega = param.omega[level]; } - MGParam(const MGParam &param, std::vector<ColorSpinorField *> &B, DiracMatrix *matResidual, DiracMatrix *matSmooth, + MGParam(const MGParam &param, std::vector<ColorSpinorField> &B, DiracMatrix *matResidual, DiracMatrix *matSmooth, DiracMatrix *matSmoothSloppy, int level = 0) : SolverParam(param), mg_global(param.mg_global), @@ -257,13 +257,15 @@ namespace quda { MGParam &param; /** This is the transfer operator that defines the prolongation and restriction operators */ - Transfer *transfer; + Transfer *transfer = nullptr; /** This tells reset() whether the transfer operator needs to be rebuilt */ - bool resetTransfer; + bool resetTransfer = false; /** This is the smoother used */ - Solver *presmoother, *postsmoother; + Solver *presmoother = nullptr; + + Solver *postsmoother = nullptr; /** Prefix label used for printf at this level */ char prefix[128]; @@ -272,43 +274,43 @@ namespace quda { char coarse_prefix[128]; /** This is the next lower level */ - MG *coarse; + MG *coarse = nullptr; /** The coarse grid solver - this either points at "coarse" or a solver preconditioned by "coarse" */ - Solver *coarse_solver; + Solver *coarse_solver = nullptr; /** Storage for the parameter struct for the coarse grid */ - MGParam *param_coarse; + MGParam *param_coarse = nullptr; /** Storage for the parameter struct for the pre-smoother */ - SolverParam *param_presmooth; + SolverParam *param_presmooth = nullptr; /** Storage for the parameter struct for the post-smoother */ - SolverParam *param_postsmooth; + SolverParam *param_postsmooth = nullptr; /** Storage for the parameter struct for the coarse solver */ - SolverParam *param_coarse_solver; + SolverParam *param_coarse_solver = nullptr; /** The coarse-grid representation of the null space vectors */ - std::vector<ColorSpinorField *> *B_coarse; + std::vector<ColorSpinorField> B_coarse; /** Residual vector */ - ColorSpinorField *r; + ColorSpinorField *r = nullptr; /** Projected source vector for preconditioned system, else just points to source */ - ColorSpinorField *b_tilde; + ColorSpinorField *b_tilde = nullptr; /** Coarse residual vector */ - ColorSpinorField *r_coarse; + ColorSpinorField *r_coarse = nullptr; /** Coarse solution vector */ - ColorSpinorField *x_coarse; + ColorSpinorField *x_coarse = nullptr; /** Coarse temporary vector */ - ColorSpinorField *tmp_coarse; + ColorSpinorField *tmp_coarse = nullptr; /** Sloppy coarse temporary vector */ - ColorSpinorField *tmp_coarse_sloppy; + ColorSpinorField *tmp_coarse_sloppy = nullptr; /** Kahler-Dirac Xinv */ std::shared_ptr<GaugeField> xInvKD; @@ -317,34 +319,34 @@ namespace quda { std::shared_ptr<GaugeField> xInvKD_sloppy; /** The fine operator used for computing inter-grid residuals */ - const Dirac *diracResidual; + const Dirac *diracResidual = nullptr; /** The fine operator used for doing smoothing */ - const Dirac *diracSmoother; + const Dirac *diracSmoother = nullptr; /** The fine operator used for doing sloppy smoothing */ - const Dirac *diracSmootherSloppy; + const Dirac *diracSmootherSloppy = nullptr; /** The coarse operator used for computing inter-grid residuals */ - Dirac *diracCoarseResidual; + Dirac *diracCoarseResidual = nullptr; /** The coarse operator used for doing smoothing */ - Dirac *diracCoarseSmoother; + Dirac *diracCoarseSmoother = nullptr; /** The coarse operator
used for doing sloppy smoothing */ - Dirac *diracCoarseSmootherSloppy; + Dirac *diracCoarseSmootherSloppy = nullptr; /** Wrapper for the residual coarse grid operator */ - DiracMatrix *matCoarseResidual; + DiracMatrix *matCoarseResidual = nullptr; /** Wrapper for the smoothing coarse grid operator */ - DiracMatrix *matCoarseSmoother; + DiracMatrix *matCoarseSmoother = nullptr; /** Wrapper for the sloppy smoothing coarse grid operator */ - DiracMatrix *matCoarseSmootherSloppy; + DiracMatrix *matCoarseSmootherSloppy = nullptr; /** Parallel hyper-cubic random number generator for generating null-space vectors */ - RNG *rng; + RNG *rng = nullptr; /** @brief Helper function called on entry to each MG function @@ -453,20 +455,20 @@ namespace quda { @brief Load the null space vectors from file @param B Loaded null-space vectors (pre-allocated) */ - void loadVectors(std::vector<ColorSpinorField *> &B); + void loadVectors(cvector_ref<ColorSpinorField> &B); /** @brief Save the null space vectors to file @param B Null-space vectors to save */ - void saveVectors(const std::vector<ColorSpinorField *> &B) const; + void saveVectors(cvector_ref<const ColorSpinorField> &B) const; /** @brief Generate the null-space vectors @param B Generated null-space vectors @param refresh Whether we are refreshing pre-existing vectors or starting afresh */ - void generateNullVectors(std::vector<ColorSpinorField *> &B, bool refresh=false); + void generateNullVectors(std::vector<ColorSpinorField> &B, bool refresh = false); /** @brief Generate lowest eigenvectors @@ -477,7 +479,7 @@ namespace quda { @brief Build free-field null-space vectors @param B Free-field null-space vectors */ - void buildFreeVectors(std::vector<ColorSpinorField *> &B); + void buildFreeVectors(std::vector<ColorSpinorField> &B); /** @brief Return if we're on a fine grid right now @@ -724,7 +726,7 @@ namespace quda { DiracM *mSmooth; DiracM *mSmoothSloppy; - std::vector<ColorSpinorField *> B; + std::vector<ColorSpinorField> B; MGParam *mgParam; @@ -739,8 +741,6 @@ namespace quda { if (mgParam) delete mgParam; - for (unsigned int i=0; i<B.size(); i++) delete B[i]; - const std::vector<ColorSpinorField *> &B; + const std::vector<ColorSpinorField> &B; /** The number of null space components */ const int Nvec; @@ -164,7 +164,7 @@ namespace quda { * @param null_precision The precision to store the null-space basis vectors in * @param enable_gpu Whether to enable this to run on GPU (as well as CPU) */ - Transfer(const std::vector<ColorSpinorField *> &B, int Nvec, int NblockOrtho, bool blockOrthoTwoPass, int *geo_bs, + Transfer(const std::vector<ColorSpinorField> &B, int Nvec, int NblockOrtho, bool blockOrthoTwoPass, int *geo_bs, int spin_bs, QudaPrecision null_precision, const QudaTransferType transfer_type); /** The destructor for Transfer */ @@ -194,7 +194,7 @@ namespace quda { */ QudaPrecision NullPrecision(QudaFieldLocation location) const { - return location == QUDA_CUDA_FIELD_LOCATION ? null_precision : std::max(B[0]->Precision(), QUDA_SINGLE_PRECISION); + return location == QUDA_CUDA_FIELD_LOCATION ? null_precision : std::max(B[0].Precision(), QUDA_SINGLE_PRECISION); } /** @@ -205,7 +205,7 @@ namespace quda { const ColorSpinorField& Vectors(QudaFieldLocation location=QUDA_INVALID_FIELD_LOCATION) const { if (location == QUDA_INVALID_FIELD_LOCATION) { // if not set then we return the memory space where the input vectors are stored - return B[0]->Location() == QUDA_CUDA_FIELD_LOCATION ? *V_d : *V_h; + return B[0].Location() == QUDA_CUDA_FIELD_LOCATION ? *V_d : *V_h; } else { return location == QUDA_CUDA_FIELD_LOCATION ? *V_d : *V_h; } @@ -281,11 +281,11 @@ namespace quda { calculation. This provides better accuracy in fixed-point precision.
*/ - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass); template - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass); /** diff --git a/lib/block_orthogonalize.in.cpp b/lib/block_orthogonalize.in.cpp index 3912cc0a69..1f3051bbd2 100644 --- a/lib/block_orthogonalize.in.cpp +++ b/lib/block_orthogonalize.in.cpp @@ -7,7 +7,7 @@ namespace quda }; template - void BlockOrthogonalize2(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize2(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass, IntList) { @@ -29,7 +29,7 @@ namespace quda } template - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass, IntList) { @@ -49,7 +49,7 @@ namespace quda } } - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass) { if constexpr (is_enabled_multigrid()) { diff --git a/lib/block_orthogonalize.in.cu b/lib/block_orthogonalize.in.cu index 64651fb55f..e4e3a9e12c 100644 --- a/lib/block_orthogonalize.in.cu +++ b/lib/block_orthogonalize.in.cu @@ -49,7 +49,7 @@ namespace quda { template using Arg = BlockOrthoArg; ColorSpinorField &V; - const std::vector B; + const std::vector &B; const int *fine_to_coarse; const int *coarse_to_fine; const int *geo_bs; @@ -61,7 +61,7 @@ namespace quda { double max; public: - BlockOrtho(ColorSpinorField &V, const std::vector B, const int *fine_to_coarse, + BlockOrtho(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int n_block_ortho, bool two_pass) : TunableBlock2D(V, false, chiral_blocks), V(V), @@ -113,8 +113,8 @@ namespace quda { } template - void launch_host_(const TuneParam &tp, const qudaStream_t &stream, - const std::vector &B, std::index_sequence) + void launch_host_(const TuneParam &tp, const qudaStream_t &stream, const std::vector &B, + std::index_sequence) { Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...); launch_host(tp, stream, arg); @@ -122,8 +122,8 @@ namespace quda { } template - void launch_device_(const TuneParam &tp, const qudaStream_t &stream, - const std::vector &B, std::index_sequence) + void launch_device_(const TuneParam &tp, const qudaStream_t &stream, const std::vector &B, + std::index_sequence) { Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...); arg.swizzle_factor = tp.aux.x; @@ -136,7 +136,8 @@ namespace quda { constexpr bool disable_ghost = true; TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); if (V.Location() == 
QUDA_CPU_FIELD_LOCATION) { - if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER && B[0]->FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) { + if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER + && B[0].FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) { typedef FieldOrderCB Rotator; typedef FieldOrderCB Vector; launch_host_(tp, stream, B, std::make_index_sequence()); @@ -146,12 +147,12 @@ namespace quda { } else { constexpr auto vOrder = colorspinor::getNative(nSpin); constexpr auto bOrder = colorspinor::getNative(nSpin); - if (V.FieldOrder() == vOrder && B[0]->FieldOrder() == bOrder) { + if (V.FieldOrder() == vOrder && B[0].FieldOrder() == bOrder) { typedef FieldOrderCB Rotator; typedef FieldOrderCB::value> Vector; launch_device_(tp, stream, B, std::make_index_sequence()); } else { - errorQuda("Unsupported field order V=%d B=%d", V.FieldOrder(), B[0]->FieldOrder()); + errorQuda("Unsupported field order V=%d B=%d", V.FieldOrder(), B[0].FieldOrder()); } } } @@ -203,7 +204,7 @@ namespace quda { long long bytes() const { - return nVec * B[0]->Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes() + return nVec * B[0].Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes() + (n_block_ortho - 1) * (V.Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes()); } @@ -212,7 +213,7 @@ namespace quda { }; template - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int n_block_ortho, bool two_pass) { int geo_blocksize = 1; @@ -233,7 +234,7 @@ namespace quda { } template - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass) { if (!is_enabled_spin(V.Nspin())) errorQuda("nSpin %d has not been built", V.Nspin()); @@ -270,34 +271,35 @@ namespace quda { constexpr int coarseColor = @QUDA_MULTIGRID_NVEC2@; template <> - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, - const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass) + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, + const int *fine_to_coarse, const int *coarse_to_fine, + const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass) { - if (!is_enabled(V.Precision()) || !is_enabled(B[0]->Precision())) - errorQuda("QUDA_PRECISION=%d does not enable required precision combination (V = %d B = %d)", - QUDA_PRECISION, V.Precision(), B[0]->Precision()); + if (!is_enabled(V.Precision()) || !is_enabled(B[0].Precision())) + errorQuda("QUDA_PRECISION=%d does not enable required precision combination (V = %d B = %d)", QUDA_PRECISION, + V.Precision(), B[0].Precision()); if constexpr (is_enabled_multigrid()) { - if (B[0]->data() == nullptr) { + if (B[0].data() == nullptr) { warningQuda("Trying to BlockOrthogonalize staggered transform, skipping..."); return; } - if (V.Precision() == QUDA_DOUBLE_PRECISION && B[0]->Precision() == QUDA_DOUBLE_PRECISION) { + if (V.Precision() == QUDA_DOUBLE_PRECISION && B[0].Precision() == QUDA_DOUBLE_PRECISION) { if constexpr (is_enabled_multigrid_double()) BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho, two_pass); else errorQuda("Double 
precision multigrid has not been enabled"); - } else if (V.Precision() == QUDA_SINGLE_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) { + } else if (V.Precision() == QUDA_SINGLE_PRECISION && B[0].Precision() == QUDA_SINGLE_PRECISION) { if constexpr (is_enabled(QUDA_SINGLE_PRECISION)) BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho, two_pass); - } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) { + } else if (V.Precision() == QUDA_HALF_PRECISION && B[0].Precision() == QUDA_SINGLE_PRECISION) { if constexpr (is_enabled(QUDA_HALF_PRECISION) && is_enabled(QUDA_SINGLE_PRECISION)) BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho, two_pass); - } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_HALF_PRECISION) { + } else if (V.Precision() == QUDA_HALF_PRECISION && B[0].Precision() == QUDA_HALF_PRECISION) { if constexpr (is_enabled(QUDA_HALF_PRECISION)) BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho, two_pass); } else { - errorQuda("Unsupported precision combination V=%d B=%d\n", V.Precision(), B[0]->Precision()); + errorQuda("Unsupported precision combination V=%d B=%d\n", V.Precision(), B[0].Precision()); } } else { errorQuda("Multigrid has not been built"); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 995b5758f4..3313db26fe 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -2376,7 +2376,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param) csParam.create = QUDA_REFERENCE_FIELD_CREATE; } - for (int i = 0; i < mg_param.n_vec[0]; i++) { B[i] = ColorSpinorField::Create(csParam); } + for (int i = 0; i < mg_param.n_vec[0]; i++) { B[i] = ColorSpinorField(csParam); } // fill out the MG parameters for the fine level mgParam = new MGParam(mg_param, B, m, mSmooth, mSmoothSloppy); diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 89403aef04..03247e2356 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -18,34 +18,11 @@ namespace quda MG::MG(MGParam ¶m) : Solver(*param.matResidual, *param.matSmooth, *param.matSmoothSloppy, *param.matSmoothSloppy, param), param(param), - transfer(0), - resetTransfer(false), - presmoother(nullptr), - postsmoother(nullptr), - coarse(nullptr), - coarse_solver(nullptr), - param_coarse(nullptr), - param_presmooth(nullptr), - param_postsmooth(nullptr), - param_coarse_solver(nullptr), - r(nullptr), - b_tilde(nullptr), - r_coarse(nullptr), - x_coarse(nullptr), - tmp_coarse(nullptr), - tmp_coarse_sloppy(nullptr), xInvKD(nullptr), xInvKD_sloppy(nullptr), diracResidual(param.matResidual->Expose()), diracSmoother(param.matSmooth->Expose()), - diracSmootherSloppy(param.matSmoothSloppy->Expose()), - diracCoarseResidual(nullptr), - diracCoarseSmoother(nullptr), - diracCoarseSmootherSloppy(nullptr), - matCoarseResidual(nullptr), - matCoarseSmoother(nullptr), - matCoarseSmootherSloppy(nullptr), - rng(nullptr) + diracSmootherSloppy(param.matSmoothSloppy->Expose()) { sprintf(prefix, "MG level %d (%s): ", param.level, param.location == QUDA_CUDA_FIELD_LOCATION ? 
"GPU" : "CPU"); pushLevel(param.level); @@ -59,7 +36,7 @@ namespace quda // allocating vectors { // create residual vectors - ColorSpinorParam csParam(*(param.B[0])); + ColorSpinorParam csParam(param.B[0]); csParam.create = QUDA_NULL_FIELD_CREATE; csParam.location = param.location; csParam.setPrecision(param.mg_global.invert_param->cuda_prec_sloppy, QUDA_INVALID_PRECISION, @@ -67,7 +44,8 @@ namespace quda if (csParam.location==QUDA_CUDA_FIELD_LOCATION) { csParam.gammaBasis = param.level > 0 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS: QUDA_UKQCD_GAMMA_BASIS; } - if (param.B[0]->Nspin() == 1) csParam.gammaBasis = param.B[0]->GammaBasis(); // hack for staggered to avoid unnecessary basis checks + if (param.B[0].Nspin() == 1) + csParam.gammaBasis = param.B[0].GammaBasis(); // hack for staggered to avoid unnecessary basis checks r = new ColorSpinorField(csParam); // if we're using preconditioning then allocate storage for the preconditioned source vector @@ -78,7 +56,7 @@ namespace quda } } - rng = new RNG(*param.B[0], 1234); + rng = new RNG(param.B[0], 1234); if (param.transfer_type == QUDA_TRANSFER_AGGREGATE) { if (param.level < param.Nlevel - 1) { @@ -88,7 +66,7 @@ namespace quda // Initializing to random vectors for (int i = 0; i < (int)param.B.size(); i++) { spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); - *param.B[i] = *r; + param.B[i] = *r; } } if (param.mg_global.num_setup_iter[param.level] > 0) { @@ -122,7 +100,7 @@ namespace quda void MG::reset(bool refresh) { pushLevel(param.level); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("%s level %d\n", transfer ? "Resetting" : "Creating", param.level); + logQuda(QUDA_VERBOSE, "%s level %d\n", transfer ? "Resetting" : "Creating", param.level); destroySmoother(); destroyCoarseSolver(); @@ -153,50 +131,51 @@ namespace quda } } else { // create transfer operator - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating transfer operator\n"); + logQuda(QUDA_VERBOSE, "Creating transfer operator\n"); transfer = new Transfer(param.B, param.Nvec, param.NblockOrtho, param.blockOrthoTwoPass, param.geoBlockSize, param.spinBlockSize, param.mg_global.precision_null[param.level], param.mg_global.transfer_type[param.level]); - for (int i=0; iCreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + tmp_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); // create coarse residual vector if not already created in verify() if (!r_coarse) - r_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + r_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); // create coarse solution vector if not already created in verify() if (!x_coarse) - x_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + x_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); - B_coarse = new std::vector(); int nVec_coarse = std::max(param.Nvec, param.mg_global.n_vec[param.level + 1]); - B_coarse->resize(nVec_coarse); + B_coarse.resize(nVec_coarse); // only have single precision B vectors on the coarse grid QudaPrecision B_coarse_precision = 
std::max(param.mg_global.precision_null[param.level+1], QUDA_SINGLE_PRECISION); for (int i=0; iCreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, B_coarse_precision, param.mg_global.setup_location[param.level+1]); + B_coarse[i] = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, + B_coarse_precision, param.mg_global.setup_location[param.level + 1]); // if we're not generating on all levels then we need to propagate the vectors down if ((param.level != 0 || param.Nlevel - 1) && param.mg_global.generate_all_levels == QUDA_BOOLEAN_FALSE) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); - for (int i=0; iR(*(*B_coarse)[i], *(param.B[i])); + logQuda(QUDA_VERBOSE, "Restricting null space vectors\n"); + for (int i = 0; i < param.Nvec; i++) { + zero(B_coarse[i]); + transfer->R(B_coarse[i], param.B[i]); } } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transfer operator done\n"); + logQuda(QUDA_VERBOSE, "Transfer operator done\n"); } // we no longer need the B fields for this level, can evict them to host memory // (only if using managed memory and prefetching is enabled, otherwise no-op) - for (int i = 0; i < param.Nvec; i++) { param.B[i]->prefetch(QUDA_CPU_FIELD_LOCATION); } + for (int i = 0; i < param.Nvec; i++) { param.B[i].prefetch(QUDA_CPU_FIELD_LOCATION); } createCoarseDirac(); } @@ -219,8 +198,8 @@ namespace quda coarse->reset(refresh); } else { // create the next multigrid level - param_coarse = new MGParam(param, *B_coarse, matCoarseResidual, matCoarseSmoother, matCoarseSmootherSloppy, - param.level + 1); + param_coarse + = new MGParam(param, B_coarse, matCoarseResidual, matCoarseSmoother, matCoarseSmootherSloppy, param.level + 1); param_coarse->fine = this; param_coarse->delta = 1e-20; param_coarse->precision = param.mg_global.invert_param->cuda_prec_precondition; @@ -238,7 +217,7 @@ namespace quda diracSmoother->prefetch(QUDA_CUDA_FIELD_LOCATION); diracSmootherSloppy->prefetch(QUDA_CUDA_FIELD_LOCATION); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Setup of level %d done\n", param.level); + logQuda(QUDA_VERBOSE, "Setup of level %d done\n", param.level); popLevel(); } @@ -321,7 +300,7 @@ namespace quda pushLevel(param.level); // create the smoother for this level - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating smoother\n"); + logQuda(QUDA_VERBOSE, "Creating smoother\n"); destroySmoother(); param_presmooth = new SolverParam(param); @@ -381,7 +360,7 @@ namespace quda *param.matSmoothSloppy) : nullptr; } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Smoother done\n"); + logQuda(QUDA_VERBOSE, "Smoother done\n"); popLevel(); } @@ -389,7 +368,7 @@ namespace quda void MG::createCoarseDirac() { pushLevel(param.level); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating coarse Dirac operator\n"); + logQuda(QUDA_VERBOSE, "Creating coarse Dirac operator\n"); // check if we are coarsening the preconditioned system then bool preconditioned_coarsen = (param.coarse_grid_solution_type == QUDA_MATPC_SOLUTION && param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE); @@ -415,7 +394,7 @@ namespace quda // Parameters that matter for coarse construction and application diracParam.dirac = preconditioned_coarsen ? const_cast(diracSmoother) : const_cast(diracResidual); - diracParam.kappa = (param.B[0]->Nspin() == 1) ? + diracParam.kappa = (param.B[0].Nspin() == 1) ? 
-1.0 : diracParam.dirac->Kappa(); // -1 cancels automatic kappa in application of Y fields diracParam.mass = diracParam.dirac->Mass(); @@ -475,7 +454,7 @@ namespace quda matCoarseSmoother = new DiracM(*diracCoarseSmoother); matCoarseSmootherSloppy = new DiracM(*diracCoarseSmootherSloppy); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Coarse Dirac operator done\n"); + logQuda(QUDA_VERBOSE, "Coarse Dirac operator done\n"); popLevel(); } @@ -596,7 +575,7 @@ namespace quda int defl_size = coarse_solver_inner.deflationSpaceSize(); if (defl_size > 0 && transfer && param.mg_global.preserve_deflation) { // Deflation space exists and we are going to create a new solver. Extract deflation space. - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Extracting deflation space size %d to MG\n", defl_size); + logQuda(QUDA_VERBOSE, "Extracting deflation space size %d to MG\n", defl_size); coarse_solver_inner.extractDeflationSpace(evecs); } delete coarse_solver; @@ -616,12 +595,12 @@ namespace quda void MG::createCoarseSolver() { pushLevel(param.level); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating coarse solver wrapper\n"); + logQuda(QUDA_VERBOSE, "Creating coarse solver wrapper\n"); destroyCoarseSolver(); if (param.cycle_type == QUDA_MG_CYCLE_VCYCLE && param.level < param.Nlevel-2) { // if coarse solver is not a bottom solver and on the second to bottom level then we can just use the coarse solver as is coarse_solver = coarse; - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Assigned coarse solver to coarse MG operator\n"); + logQuda(QUDA_VERBOSE, "Assigned coarse solver to coarse MG operator\n"); } else if (param.cycle_type == QUDA_MG_CYCLE_RECURSIVE || param.level == param.Nlevel-2) { param_coarse_solver = new SolverParam(param); @@ -730,8 +709,7 @@ namespace quda // vectors stored in the parent MG instead coarse_solver_inner.setDeflateCompute(false); coarse_solver_inner.setRecomputeEvals(true); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Transferring deflation space size %d to coarse solver\n", defl_size); + logQuda(QUDA_VERBOSE, "Transferring deflation space size %d to coarse solver\n", defl_size); // Create space in coarse solver to hold deflation space, destroy space in MG. 
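// Sketch of the hand-off pattern used here (solver names are illustrative, the two
// member calls are the ones visible in this hunk): the deflation space is extracted
// before the old coarse solver is destroyed and injected into its replacement, so
// the expensively computed eigenvectors survive the rebuild:
//   old_coarse_solver.extractDeflationSpace(evecs); // MG takes ownership of evecs
//   delete old_coarse_solver;
//   new_coarse_solver.injectDeflationSpace(evecs);  // ownership moves to the new solver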
coarse_solver_inner.injectDeflationSpace(evecs); } @@ -745,11 +723,11 @@ namespace quda param_coarse_solver->maxiter = param.mg_global.coarse_solver_maxiter[param.level + 1]; } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Assigned coarse solver to preconditioned GCR solver\n"); + logQuda(QUDA_VERBOSE, "Assigned coarse solver to preconditioned GCR solver\n"); } else { errorQuda("Multigrid cycle type %d not supported", param.cycle_type); } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Coarse solver wrapper done\n"); + logQuda(QUDA_VERBOSE, "Coarse solver wrapper done\n"); popLevel(); } @@ -765,12 +743,6 @@ namespace quda if (param_coarse_solver) delete param_coarse_solver; } - if (B_coarse) { - int nVec_coarse = std::max(param.Nvec, param.mg_global.n_vec[param.level + 1]); - for (int i = 0; i < nVec_coarse; i++) - if ((*B_coarse)[i]) delete (*B_coarse)[i]; - delete B_coarse; - } if (transfer) delete transfer; if (matCoarseSmootherSloppy) delete matCoarseSmootherSloppy; if (diracCoarseSmootherSloppy) delete diracCoarseSmootherSloppy; @@ -841,7 +813,7 @@ namespace quda for (int i = 0; i < param.Nvec; i++) { // as well as copying to the correct location this also changes basis if necessary - tmp1 = *param.B[i]; + tmp1 = param.B[i]; transfer->R(*r_coarse, tmp1); transfer->P(tmp2, *r_coarse); @@ -867,14 +839,14 @@ namespace quda logQuda(QUDA_SUMMARIZE, "Checking 1 > || (1 - DP(P^dagDP)P^dag) v_k || / || v_k || for %d vectors\n", param.Nvec); for (int i = 0; i < param.Nvec; i++) { - transfer->R(*r_coarse, *(param.B[i])); + transfer->R(*r_coarse, param.B[i]); (*coarse_solver)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore prefix after return from coarse grid transfer->P(tmp2, *x_coarse); (*param.matResidual)(tmp1, tmp2); - tmp2 = *(param.B[i]); - logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e\n", i, norm2(*param.B[i]), norm2(tmp1)); - logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(*param.B[i]))); + tmp2 = param.B[i]; + logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e\n", i, norm2(param.B[i]), norm2(tmp1)); + logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(param.B[i]))); } sprintf(prefix, "MG level %d (%s): ", param.level + 1, param.location == QUDA_CUDA_FIELD_LOCATION ? 
"GPU" : "CPU"); @@ -887,14 +859,14 @@ namespace quda param.Nvec); for (int i=0; iR(*r_coarse, *(param.B[i])); + transfer->R(*r_coarse, param.B[i]); (*coarse)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore output prefix transfer->P(tmp2, *x_coarse); param.matResidual(tmp1, tmp2); - tmp2 = *(param.B[i]); - logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e ", i, norm2(*param.B[i]), norm2(tmp1)); - logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(*param.B[i])) ); + tmp2 = param.B[i]; + logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e ", i, norm2(param.B[i]), norm2(tmp1)); + logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(param.B[i])) ); } #endif @@ -902,18 +874,18 @@ namespace quda // Otherwise these get created in the coarse `reset()` routine later if (!tmp_coarse) - tmp_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + tmp_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); // create coarse residual vector if not already created in verify() if (!r_coarse) - r_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + r_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); // create coarse solution vector if not already created in verify() if (!x_coarse) - x_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + x_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); { logQuda(QUDA_SUMMARIZE, "Checking 0 = (1 - P^\\dagger P) eta_c\n"); @@ -1135,16 +1107,16 @@ namespace quda for (int i = 0; i < param.Nvec; i++) { // Restrict Evec, place result in r_coarse - transfer->R(*r_coarse, *param.B[i]); + transfer->R(*r_coarse, param.B[i]); // Prolong r_coarse, place result in tmp2 transfer->P(tmp2, *r_coarse); - printfQuda("Vector %d: norms v_k = %e P^dag v_k = %e PP^dag v_k = %e\n", i, norm2(*param.B[i]), + printfQuda("Vector %d: norms v_k = %e P^dag v_k = %e PP^dag v_k = %e\n", i, norm2(param.B[i]), norm2(*r_coarse), norm2(tmp2)); // Compare v_k and PP^dag v_k. 
- auto max_deviation = blas::max_deviation(tmp2, *param.B[i]); - auto l2_deviation = sqrt(xmyNorm(*param.B[i], tmp2) / norm2(*param.B[i])); + auto max_deviation = blas::max_deviation(tmp2, param.B[i]); + auto l2_deviation = sqrt(xmyNorm(param.B[i], tmp2) / norm2(param.B[i])); printfQuda("L2 relative deviation = %e max deviation = %e\n", l2_deviation, max_deviation[0]); if (param.mg_global.run_oblique_proj_check) { @@ -1156,17 +1128,17 @@ namespace quda // Oblique projections logQuda(QUDA_SUMMARIZE, "Checking 1 > || (1 - DP(P^dagDP)P^dag) v_k || / || v_k || for vector %d\n", i); - transfer->R(*r_coarse, *param.B[i]); + transfer->R(*r_coarse, param.B[i]); (*coarse_solver)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore prefix after return from coarse grid transfer->P(tmp2, *x_coarse); (*param.matResidual)(tmp1, tmp2); - logQuda(QUDA_SUMMARIZE, "Vector %d: norms v_k %e DP(P^dagDP)P^dag v_k %e\n", i, norm2(*param.B[i]), + logQuda(QUDA_SUMMARIZE, "Vector %d: norms v_k %e DP(P^dagDP)P^dag v_k %e\n", i, norm2(param.B[i]), norm2(tmp1)); - max_deviation = blas::max_deviation(tmp1, *param.B[i]); + max_deviation = blas::max_deviation(tmp1, param.B[i]); logQuda(QUDA_SUMMARIZE, "L2 relative deviation = %e, max deviation = %e\n", - sqrt(xmyNorm(*param.B[i], tmp1) / norm2(*param.B[i])), max_deviation[0]); + sqrt(xmyNorm(param.B[i], tmp1) / norm2(param.B[i])), max_deviation[0]); } sprintf(prefix, "MG level %d (%s): ", param.level + 1, @@ -1314,7 +1286,7 @@ namespace quda } // supports separate reading or single file read - void MG::loadVectors(std::vector &B) + void MG::loadVectors(cvector_ref &B) { if (param.transfer_type != QUDA_TRANSFER_AGGREGATE) { warningQuda("Cannot load near-null vectors for top level of staggered MG solve."); @@ -1327,15 +1299,13 @@ namespace quda vec_infile += "_nvec_"; vec_infile += std::to_string(param.mg_global.n_vec[param.level]); VectorIO io(vec_infile); - vector_ref B_ref; - for (auto i = 0u; i < B.size(); i++) B_ref.push_back(*B[i]); - io.load(std::move(B_ref)); + io.load(B); popLevel(); getProfile().TPSTOP(QUDA_PROFILE_IO); } } - void MG::saveVectors(const std::vector &B) const + void MG::saveVectors(cvector_ref &B) const { if (param.transfer_type != QUDA_TRANSFER_AGGREGATE) { warningQuda("Cannot save near-null vectors for top level of staggered MG solve."); @@ -1348,9 +1318,7 @@ namespace quda vec_outfile += "_nvec_"; vec_outfile += std::to_string(param.mg_global.n_vec[param.level]); VectorIO io(vec_outfile, false, param.mg_global.mg_vec_partfile[param.level]); - vector_ref B_ref; - for (auto i = 0u; i < B.size(); i++) B_ref.push_back(*B[i]); - io.save(std::move(B_ref)); + io.save(B); popLevel(); getProfile().TPSTOP(QUDA_PROFILE_IO); } @@ -1366,7 +1334,7 @@ namespace quda if (param.level < param.Nlevel - 2) coarse->dumpNullVectors(); } - void MG::generateNullVectors(std::vector &B, bool refresh) + void MG::generateNullVectors(std::vector &B, bool refresh) { pushLevel(param.level); @@ -1402,11 +1370,11 @@ namespace quda solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); solverParam.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; - ColorSpinorParam csParam(*B[0]); // Create spinor field parameters: + ColorSpinorParam csParam(B[0]); // Create spinor field parameters: csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering csParam.location = QUDA_CUDA_FIELD_LOCATION; // hard code to GPU location for null-space generation for now - csParam.gammaBasis = B[0]->Nspin() == 1 ? 
QUDA_DEGRAND_ROSSI_GAMMA_BASIS : - QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered + csParam.gammaBasis = B[0].Nspin() == 1 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS : + QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField b(csParam); ColorSpinorField x(csParam); @@ -1417,7 +1385,7 @@ namespace quda bool schwarz_reset = solverParam.inv_type != QUDA_MG_INVERTER && param.mg_global.smoother_schwarz_type[param.level] != QUDA_INVALID_SCHWARZ; if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Disabling Schwarz for null-space finding"); + logQuda(QUDA_VERBOSE, "Disabling Schwarz for null-space finding\n"); int commDim[QUDA_MAX_DIM]; for (int i = 0; i < QUDA_MAX_DIM; i++) commDim[i] = 1; diracSmootherSloppy->setCommDim(commDim); @@ -1458,54 +1426,55 @@ namespace quda } for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, - param.mg_global.num_setup_iter[param.level]); + logQuda(QUDA_VERBOSE, "Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, + param.mg_global.num_setup_iter[param.level]); // global orthonormalization of the initial null-space vectors - if(param.mg_global.pre_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j<i; j++) { - Complex alpha = cDotProduct(*B[j], *B[i]); // <j,i> - caxpy(-alpha, *B[j], *B[i]); // i-<j,i>j + if (param.mg_global.pre_orthonormalize) { + for (auto i = 0u; i < B.size(); i++) { + for (auto j = 0u; j < i; j++) { + Complex alpha = cDotProduct(B[j], B[i]); // <j,i> + caxpy(-alpha, B[j], B[i]); // i-<j,i>j } - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/<i,i> + double nrm2 = norm2(B[i]); + if (nrm2 > 1e-16) + ax(1.0 / sqrt(nrm2), B[i]); // i/<i,i> else errorQuda("\nCannot normalize %u vector\n", i); } } // launch solver for each source - for (int i=0; i<(int)B.size(); i++) { + for (auto i = 0u; i < B.size(); i++) { if (param.mg_global.setup_type == QUDA_TEST_VECTOR_SETUP) { // DDalphaAMG test vector idea - b = *B[i]; // inverting against the vector + b = B[i]; // inverting against the vector zero(x); // with zero initial guess } else { - x = *B[i]; + x = B[i]; zero(b); } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial guess = %g\n", norm2(x)); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial rhs = %g\n", norm2(b)); + logQuda(QUDA_VERBOSE, "Initial guess = %g\n", norm2(x)); + logQuda(QUDA_VERBOSE, "Initial rhs = %g\n", norm2(b)); ColorSpinorField *out=nullptr, *in=nullptr; diracSmoother->prepare(in, out, x, b, QUDA_MAT_SOLUTION); (*solve)(*out, *in); diracSmoother->reconstruct(x, b, QUDA_MAT_SOLUTION); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution = %g\n", norm2(x)); - *B[i] = x; + logQuda(QUDA_VERBOSE, "Solution = %g\n", norm2(x)); + B[i] = x; } // global orthonormalization of the generated null-space vectors if (param.mg_global.post_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j<i; j++) { - Complex alpha = cDotProduct(*B[j], *B[i]); // <j,i> - caxpy(-alpha, *B[j], *B[i]); // i-<j,i>j + for (auto i = 0u; i < B.size(); i++) { + for (auto j = 0u; j < i; j++) { + Complex alpha = cDotProduct(B[j], B[i]); // <j,i> + caxpy(-alpha, B[j], B[i]); // i-<j,i>j } - double nrm2 = norm2(*B[i]); - if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/<i,i> + double nrm2 = norm2(B[i]); + if (sqrt(nrm2) > 1e-16) + ax(1.0 / sqrt(nrm2), B[i]); // i/<i,i> else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); } } @@ -1517,12 +1486,12 @@ namespace quda reset(); if (
param.level < param.Nlevel-2 ) { if ( param.mg_global.generate_all_levels == QUDA_BOOLEAN_TRUE ) { - coarse->generateNullVectors(*B_coarse, refresh); + coarse->generateNullVectors(B_coarse, refresh); } else { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); - for (int i=0; iR(*(*B_coarse)[i], *(param.B[i])); + logQuda(QUDA_VERBOSE, "Restricting null space vectors\n"); + for (auto i = 0; i < param.Nvec; i++) { + zero(B_coarse[i]); + transfer->R(B_coarse[i], param.B[i]); } // rebuild the transfer operator in the coarse level coarse->resetTransfer = true; @@ -1543,7 +1512,7 @@ namespace quda // reenable Schwarz if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Reenabling Schwarz for null-space finding"); + logQuda(QUDA_VERBOSE, "Reenabling Schwarz for null-space finding\n"); int commDim[QUDA_MAX_DIM]; for (int i=0; isetCommDim(commDim); @@ -1558,15 +1527,15 @@ namespace quda // generate a full span of free vectors. // FIXME: Assumes fine level is SU(3). - void MG::buildFreeVectors(std::vector &B) + void MG::buildFreeVectors(std::vector &B) { pushLevel(param.level); const int Nvec = B.size(); // Given the number of colors and spins, figure out if the number // of vectors in 'B' makes sense. - const int Ncolor = B[0]->Ncolor(); - const int Nspin = B[0]->Nspin(); + const int Ncolor = B[0].Ncolor(); + const int Nspin = B[0].Nspin(); if (Ncolor == 3) // fine level { @@ -1575,14 +1544,13 @@ namespace quda // There needs to be 6 null vectors -> 12 after chirality. if (Nvec != 6) errorQuda("\nError in MG::buildFreeVectors: Wilson-type fermions require Nvec = 6"); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Building %d free field vectors for Wilson-type fermions\n", Nvec); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Wilson-type fermions\n", Nvec); // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); + for (int i = 0; i < Nvec; i++) zero(B[i]); // Create a temporary vector. - ColorSpinorParam csParam(*B[0]); + ColorSpinorParam csParam(B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField tmp(csParam); @@ -1590,9 +1558,9 @@ namespace quda for (int c = 0; c < Ncolor; c++) { for (int s = 0; s < 2; s++) { tmp.Source(QUDA_CONSTANT_SOURCE, 1, s, c); - xpy(tmp, *B[counter]); + xpy(tmp, B[counter]); tmp.Source(QUDA_CONSTANT_SOURCE, 1, s + 2, c); - xpy(tmp, *B[counter]); + xpy(tmp, B[counter]); counter++; } } @@ -1602,68 +1570,67 @@ namespace quda // There needs to be 24 null vectors -> 48 after chirality. if (Nvec != 24) errorQuda("\nError in MG::buildFreeVectors: Staggered-type fermions require Nvec = 24\n"); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Building %d free field vectors for Staggered-type fermions\n", Nvec); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Staggered-type fermions\n", Nvec); // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); + for (int i = 0; i < Nvec; i++) zero(B[i]); // Create a temporary vector. - ColorSpinorParam csParam(*B[0]); + ColorSpinorParam csParam(B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField tmp(csParam); // Build free null vectors. - for (int c = 0; c < B[0]->Ncolor(); c++) { + for (int c = 0; c < B[0].Ncolor(); c++) { // Need to pair an even+odd corner together // since they'll get split up. 
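// (Assuming the usual encoding where bit 0 of the hex corner index is the x coordinate,
//  corners 2k and 2k+1 sit on opposite site parities, so summing each such pair into a
//  single null vector keeps both parities populated after the even/odd splitting.)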
// 0000, 0001 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x0, c); - xpy(tmp, *B[8 * c + 0]); + xpy(tmp, B[8 * c + 0]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x1, c); - xpy(tmp, *B[8 * c + 0]); + xpy(tmp, B[8 * c + 0]); // 0010, 0011 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x2, c); - xpy(tmp, *B[8 * c + 1]); + xpy(tmp, B[8 * c + 1]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x3, c); - xpy(tmp, *B[8 * c + 1]); + xpy(tmp, B[8 * c + 1]); // 0100, 0101 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x4, c); - xpy(tmp, *B[8 * c + 2]); + xpy(tmp, B[8 * c + 2]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x5, c); - xpy(tmp, *B[8 * c + 2]); + xpy(tmp, B[8 * c + 2]); // 0110, 0111 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x6, c); - xpy(tmp, *B[8 * c + 3]); + xpy(tmp, B[8 * c + 3]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x7, c); - xpy(tmp, *B[8 * c + 3]); + xpy(tmp, B[8 * c + 3]); // 1000, 1001 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x8, c); - xpy(tmp, *B[8 * c + 4]); + xpy(tmp, B[8 * c + 4]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x9, c); - xpy(tmp, *B[8 * c + 4]); + xpy(tmp, B[8 * c + 4]); // 1010, 1011 tmp.Source(QUDA_CORNER_SOURCE, 1, 0xA, c); - xpy(tmp, *B[8 * c + 5]); + xpy(tmp, B[8 * c + 5]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0xB, c); - xpy(tmp, *B[8 * c + 5]); + xpy(tmp, B[8 * c + 5]); // 1100, 1101 tmp.Source(QUDA_CORNER_SOURCE, 1, 0xC, c); - xpy(tmp, *B[8 * c + 6]); + xpy(tmp, B[8 * c + 6]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0xD, c); - xpy(tmp, *B[8 * c + 6]); + xpy(tmp, B[8 * c + 6]); // 1110, 1111 tmp.Source(QUDA_CORNER_SOURCE, 1, 0xE, c); - xpy(tmp, *B[8 * c + 7]); + xpy(tmp, B[8 * c + 7]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0xF, c); - xpy(tmp, *B[8 * c + 7]); + xpy(tmp, B[8 * c + 7]); } } else { @@ -1674,40 +1641,40 @@ namespace quda // There needs to be Ncolor null vectors. if (Nvec != Ncolor) errorQuda("\nError in MG::buildFreeVectors: Coarse fermions require Nvec = Ncolor"); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Building %d free field vectors for Coarse fermions\n", Ncolor); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Coarse fermions\n", Ncolor); // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); + for (int i = 0; i < Nvec; i++) zero(B[i]); // Create a temporary vector. - ColorSpinorParam csParam(*B[0]); + ColorSpinorParam csParam(B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField tmp(csParam); for (int c = 0; c < Ncolor; c++) { tmp.Source(QUDA_CONSTANT_SOURCE, 1, 0, c); - xpy(tmp, *B[c]); + xpy(tmp, B[c]); tmp.Source(QUDA_CONSTANT_SOURCE, 1, 1, c); - xpy(tmp, *B[c]); + xpy(tmp, B[c]); } } else if (Nspin == 1) { // There needs to be Ncolor null vectors. if (Nvec != Ncolor) errorQuda("\nError in MG::buildFreeVectors: Coarse fermions require Nvec = Ncolor"); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Building %d free field vectors for Coarse fermions\n", Ncolor); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Coarse fermions\n", Ncolor); // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); + for (int i = 0; i < Nvec; i++) zero(B[i]); // Create a temporary vector. 
- ColorSpinorParam csParam(*B[0]); + ColorSpinorParam csParam(B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField tmp(csParam); for (int c = 0; c < Ncolor; c++) { tmp.Source(QUDA_CONSTANT_SOURCE, 1, 0, c); - xpy(tmp, *B[c]); + xpy(tmp, B[c]); } } else { @@ -1717,14 +1684,15 @@ namespace quda // global orthonormalization of the generated null-space vectors if(param.mg_global.post_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/<i,i> + for (auto i = 0u; i < B.size(); i++) { + double nrm2 = norm2(B[i]); + if (nrm2 > 1e-16) + ax(1.0 / sqrt(nrm2), B[i]); // i/<i,i> else errorQuda("\nCannot normalize %u vector\n", i); } } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Done building free vectors\n"); + logQuda(QUDA_VERBOSE, "Done building free vectors\n"); popLevel(); } @@ -1739,7 +1707,7 @@ namespace quda bool normop = param.mg_global.eig_param[param.level]->use_norm_op; // Dummy array to keep the eigensolver happy. - ColorSpinorParam csParam(*param.B[0]); + ColorSpinorParam csParam(param.B[0]); csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.create = QUDA_ZERO_FIELD_CREATE; // This is the vector precision used by matResidual @@ -1750,8 +1718,8 @@ namespace quda for (auto &b : B_evecs) b = ColorSpinorField(csParam); // before entering the eigen solver, let's free the B vectors to save some memory - ColorSpinorParam bParam(*param.B[0]); - for (int i = 0; i < (int)param.B.size(); i++) delete param.B[i]; + ColorSpinorParam bParam(param.B[0]); + for (auto &b : param.B) b = ColorSpinorField(); EigenSolver *eig_solve; if (!normop && !dagger) { @@ -1781,9 +1749,10 @@ namespace quda } // now reallocate the B vectors and copy in the e-vectors - for (int i = 0; i < (int)param.B.size(); i++) { - param.B[i] = new ColorSpinorField(bParam); - *param.B[i] = B_evecs[i]; // FIXME can std::move this + bParam.create = QUDA_NULL_FIELD_CREATE; + for (auto i = 0u; i < param.B.size(); i++) { + param.B[i] = ColorSpinorField(bParam); + param.B[i] = B_evecs[i]; } // only save if outfile is defined diff --git a/lib/transfer.cpp b/lib/transfer.cpp index 895055219c..d67c4c8228 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -19,7 +19,7 @@ namespace quda { * for the staggered case, there is no spin blocking, * however we do even-odd to preserve chirality (that is straightforward) */ - Transfer::Transfer(const std::vector<ColorSpinorField *> &B, int Nvec, int n_block_ortho, bool block_ortho_two_pass, + Transfer::Transfer(const std::vector<ColorSpinorField> &B, int Nvec, int n_block_ortho, bool block_ortho_two_pass, int *geo_bs, int spin_bs, QudaPrecision null_precision, const QudaTransferType transfer_type) : B(B), Nvec(Nvec), @@ -28,26 +28,26 @@ namespace quda { null_precision(null_precision), spin_bs(spin_bs), spin_map(0), - nspin_fine(B[0]->Nspin()), + nspin_fine(B[0].Nspin()), site_subset(QUDA_FULL_SITE_SUBSET), parity(QUDA_INVALID_PARITY), use_gpu(true), transfer_type(transfer_type) { postTrace(); - int ndim = B[0]->Ndim(); + int ndim = B[0].Ndim(); // Only loop over four dimensions for now, we don't have // to worry about the fifth dimension until we hit chiral fermions.
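// (Worked example of the fallback loop below, under the checks as written: requesting
//  geo_bs[0] = 16 on a local extent X(0) = 16 trips the first warning, so the block is
//  halved to 8; then X(0)/geo_bs[0] = 2 is an even coarse extent ((2+1) % 2 != 0) and
//  2 * 8 == 16 divides exactly, so a block size of 8 is accepted.)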
for (int d = 0; d < 4; d++) { while (geo_bs[d] > 0) { - if (d==0 && B[0]->X(0) == geo_bs[0]) - warningQuda("X-dimension length %d cannot block length %d", B[0]->X(0), geo_bs[0]); - else if ( (B[0]->X(d)/geo_bs[d]+1)%2 == 0) - warningQuda("Indexing does not (yet) support odd coarse dimensions: X(%d) = %d", d, B[0]->X(d)/geo_bs[d]); - else if ( (B[0]->X(d)/geo_bs[d]) * geo_bs[d] != B[0]->X(d) ) - warningQuda("cannot block dim[%d]=%d with block size = %d", d, B[0]->X(d), geo_bs[d]); - else + if (d == 0 && B[0].X(0) == geo_bs[0]) + warningQuda("X-dimension length %d cannot block length %d", B[0].X(0), geo_bs[0]); + else if ((B[0].X(d) / geo_bs[d] + 1) % 2 == 0) + warningQuda("Indexing does not (yet) support odd coarse dimensions: X(%d) = %d", d, B[0].X(d) / geo_bs[d]); + else if ((B[0].X(d) / geo_bs[d]) * geo_bs[d] != B[0].X(d)) + warningQuda("cannot block dim[%d]=%d with block size = %d", d, B[0].X(d), geo_bs[d]); + else break; // this is a valid block size so let's use it geo_bs[d] /= 2; } @@ -71,11 +71,11 @@ namespace quda { errorQuda("Invalid total geometric block size %d for transfer type optimized-kd, must be 1", total_block_size); // The number of coarse dof is technically fineColor for optimized KD - if (Nvec != B[0]->Ncolor()) - errorQuda("Invalid Nvec %d for optimized-kd aggregation, must be fine color %d", Nvec, B[0]->Ncolor()); + if (Nvec != B[0].Ncolor()) + errorQuda("Invalid Nvec %d for optimized-kd aggregation, must be fine color %d", Nvec, B[0].Ncolor()); } else { - int aggregate_size = total_block_size * B[0]->Ncolor(); + int aggregate_size = total_block_size * B[0].Ncolor(); if (spin_bs == 0) aggregate_size /= 2; // effective spin_bs of 0.5 (fine spin / coarse spin) else @@ -95,23 +95,23 @@ namespace quda { if (Nvec != 24) errorQuda("Invalid number of coarse vectors %d for staggered KD multigrid, must be 24", Nvec); } - createV(B[0]->Location()); // allocate V field + createV(B[0].Location()); // allocate V field createTmp(QUDA_CPU_FIELD_LOCATION, 1); // allocate temporaries (needed for geomap creation) // allocate and compute the fine-to-coarse and coarse-to-fine site maps - fine_to_coarse_h = static_cast(pool_pinned_malloc(B[0]->Volume()*sizeof(int))); - coarse_to_fine_h = static_cast(pool_pinned_malloc(B[0]->Volume()*sizeof(int))); + fine_to_coarse_h = static_cast(pool_pinned_malloc(B[0].Volume() * sizeof(int))); + coarse_to_fine_h = static_cast(pool_pinned_malloc(B[0].Volume() * sizeof(int))); if (enable_gpu) { - fine_to_coarse_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); - coarse_to_fine_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); + fine_to_coarse_d = static_cast(pool_device_malloc(B[0].Volume() * sizeof(int))); + coarse_to_fine_d = static_cast(pool_device_malloc(B[0].Volume() * sizeof(int))); } createGeoMap(geo_bs); // allocate the fine-to-coarse spin map spin_map = static_cast(safe_malloc(nspin_fine*sizeof(int*))); - for (int s = 0; s < B[0]->Nspin(); s++) spin_map[s] = static_cast(safe_malloc(2*sizeof(int))); + for (int s = 0; s < B[0].Nspin(); s++) spin_map[s] = static_cast(safe_malloc(2 * sizeof(int))); createSpinMap(spin_bs); reset(); @@ -123,13 +123,13 @@ namespace quda { postTrace(); // create the storage for the final block orthogonal elements - ColorSpinorParam param(*B[0]); // takes the geometry from the null-space vectors + ColorSpinorParam param(B[0]); // takes the geometry from the null-space vectors // the ordering of the V vector is defined by these parameters and // the Packed functions in ColorSpinorFieldOrder - 
param.nSpin = B[0]->Nspin(); // spin has direct mapping - param.nColor = B[0]->Ncolor()*Nvec; // nColor = number of colors * number of vectors + param.nSpin = B[0].Nspin(); // spin has direct mapping + param.nColor = B[0].Ncolor() * Nvec; // nColor = number of colors * number of vectors param.nVec = Nvec; param.create = QUDA_NULL_FIELD_CREATE; // the V field is defined on all sites regardless of B field (maybe the B fields are always full?) @@ -141,7 +141,7 @@ namespace quda { param.location = location; param.fieldOrder = location == QUDA_CUDA_FIELD_LOCATION ? colorspinor::getNative(null_precision, param.nSpin) : QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - param.setPrecision(location == QUDA_CUDA_FIELD_LOCATION ? null_precision : B[0]->Precision()); + param.setPrecision(location == QUDA_CUDA_FIELD_LOCATION ? null_precision : B[0].Precision()); if (transfer_type == QUDA_TRANSFER_COARSE_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) { @@ -173,7 +173,7 @@ namespace quda { if (location == QUDA_CPU_FIELD_LOCATION && fine_tmp_h.size() == n_src && coarse_tmp_h.size() == n_src) return; postTrace(); - ColorSpinorParam param(*B[0]); + ColorSpinorParam param(B[0]); param.create = QUDA_NULL_FIELD_CREATE; param.location = location; param.fieldOrder = location == QUDA_CUDA_FIELD_LOCATION ? colorspinor::getNative(null_precision, param.nSpin) : @@ -197,7 +197,7 @@ namespace quda { if (!enable_cpu && !enable_gpu) errorQuda("Neither CPU or GPU coarse fields initialized"); // delayed allocating this temporary until we need it - //if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION, n_src); + // if (B[0].Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION, n_src); createTmp(location, n_src); switch (location) { @@ -205,10 +205,10 @@ namespace quda { if (enable_gpu) return; createV(location); if (transfer_type == QUDA_TRANSFER_AGGREGATE) *V_d = *V_h; - fine_to_coarse_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); - coarse_to_fine_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); - qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0]->Volume() * sizeof(int), qudaMemcpyHostToDevice); - qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0]->Volume() * sizeof(int), qudaMemcpyHostToDevice); + fine_to_coarse_d = static_cast(pool_device_malloc(B[0].Volume() * sizeof(int))); + coarse_to_fine_d = static_cast(pool_device_malloc(B[0].Volume() * sizeof(int))); + qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0].Volume() * sizeof(int), qudaMemcpyHostToDevice); + qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0].Volume() * sizeof(int), qudaMemcpyHostToDevice); break; case QUDA_CPU_FIELD_LOCATION: if (enable_cpu) return; @@ -230,7 +230,7 @@ namespace quda { } if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transfer: block orthogonalizing\n"); - if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) { + if (B[0].Location() == QUDA_CUDA_FIELD_LOCATION) { if (!enable_gpu) errorQuda("enable_gpu = %d so cannot reset", enable_gpu); BlockOrthogonalize(*V_d, B, fine_to_coarse_d, coarse_to_fine_d, geo_bs, spin_bs, NblockOrtho, blockOrthoTwoPass); if (enable_cpu) { @@ -312,14 +312,14 @@ namespace quda { // now create an inverse-like variant of this - std::vector geo_sort(B[0]->Volume()); + std::vector geo_sort(B[0].Volume()); for (unsigned int i=0; iVolume() * sizeof(int), qudaMemcpyHostToDevice); - qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0]->Volume() * sizeof(int), 
qudaMemcpyHostToDevice); + qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0].Volume() * sizeof(int), qudaMemcpyHostToDevice); + qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0].Volume() * sizeof(int), qudaMemcpyHostToDevice); } } @@ -333,7 +333,7 @@ namespace quda { } else { - for (int s=0; sNspin(); s++) { + for (int s = 0; s < B[0].Nspin(); s++) { spin_map[s][0] = s / spin_bs; // not staggered, doesn't care about parity. spin_map[s][1] = s / spin_bs; } From 12cf478e0e9b573032765593d86db8252b99774e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 Jan 2024 09:29:34 -0800 Subject: [PATCH 06/21] Move all ColorSpinorFields in multigrid.cpp to objects --- include/color_spinor_field.h | 38 ------- include/multigrid.h | 12 +- lib/color_spinor_field.cpp | 83 -------------- lib/multigrid.cpp | 206 +++++++++++++++++------------------ 4 files changed, 105 insertions(+), 234 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 923678dbaa..d91531b900 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -781,44 +781,6 @@ namespace quda */ ColorSpinorField create_alias(const ColorSpinorParam ¶m = ColorSpinorParam()); - /** - @brief Create a field that aliases this field's storage. The - alias field can use a different precision than this field, - though it cannot be greater. This functionality is useful for - the case where we have multiple temporaries in different - precisions, but do not need them simultaneously. Use this functionality with caution. - @param[in] param Parameters for the alias field - */ - ColorSpinorField *CreateAlias(const ColorSpinorParam ¶m); - - /** - @brief Create a coarse color-spinor field, using this field to set the meta data - @param[in] geoBlockSize Geometric block size that defines the coarse grid dimensions - @param[in] spinlockSize Geometric block size that defines the coarse spin dimension - @param[in] Nvec Number of coarse color degrees of freedom per grid point - @param[in] precision Optionally set the precision of the fine field - @param[in] location Optionally set the location of the coarse field - @param[in] mem_type Optionally set the memory type used (e.g., can override with mapped memory) - */ - ColorSpinorField *CreateCoarse(const int *geoBlockSize, int spinBlockSize, int Nvec, - QudaPrecision precision = QUDA_INVALID_PRECISION, - QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION, - QudaMemoryType mem_Type = QUDA_MEMORY_INVALID); - - /** - @brief Create a fine color-spinor field, using this field to set the meta data - @param[in] geoBlockSize Geometric block size that defines the fine grid dimensions - @param[in] spinlockSize Geometric block size that defines the fine spin dimension - @param[in] Nvec Number of fine color degrees of freedom per grid point - @param[in] precision Optionally set the precision of the fine field - @param[in] location Optionally set the location of the fine field - @param[in] mem_type Optionally set the memory type used (e.g., can override with mapped memory) - */ - ColorSpinorField *CreateFine(const int *geoblockSize, int spinBlockSize, int Nvec, - QudaPrecision precision = QUDA_INVALID_PRECISION, - QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION, - QudaMemoryType mem_type = QUDA_MEMORY_INVALID); - /** @brief Create a coarse color-spinor field, using this field to set the meta data @param[in] geoBlockSize Geometric block size that defines the coarse grid dimensions diff --git a/include/multigrid.h b/include/multigrid.h index 
16250e5bdc..a369789dfc 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -295,22 +295,22 @@ namespace quda { std::vector B_coarse; /** Residual vector */ - ColorSpinorField *r = nullptr; + ColorSpinorField r; /** Projected source vector for preconditioned system, else just points to source */ - ColorSpinorField *b_tilde = nullptr; + ColorSpinorField b_tilde; /** Coarse residual vector */ - ColorSpinorField *r_coarse = nullptr; + ColorSpinorField r_coarse; /** Coarse solution vector */ - ColorSpinorField *x_coarse = nullptr; + ColorSpinorField x_coarse; /** Coarse temporary vector */ - ColorSpinorField *tmp_coarse = nullptr; + ColorSpinorField tmp_coarse; /** Sloppy coarse temporary vector */ - ColorSpinorField *tmp_coarse_sloppy = nullptr; + ColorSpinorField tmp_coarse_sloppy; /** Kahler-Dirac Xinv */ std::shared_ptr xInvKD; diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index e8fbfd56d5..9c6c5d0272 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -840,89 +840,6 @@ namespace quda return ColorSpinorField(param); } - ColorSpinorField *ColorSpinorField::CreateAlias(const ColorSpinorParam ¶m_) - { - if (param_.Precision() > precision) - errorQuda("Cannot create an alias to source with lower precision than the alias"); - ColorSpinorParam param(param_); - param.create = QUDA_REFERENCE_FIELD_CREATE; - param.v = data(); - - return new ColorSpinorField(param); - } - - ColorSpinorField *ColorSpinorField::CreateCoarse(const int *geoBlockSize, int spinBlockSize, int Nvec, - QudaPrecision new_precision, QudaFieldLocation new_location, - QudaMemoryType new_mem_type) - { - ColorSpinorParam coarseParam(*this); - for (int d = 0; d < nDim; d++) coarseParam.x[d] = x[d] / geoBlockSize[d]; - - int geoBlockVolume = 1; - for (int d = 0; d < nDim; d++) { geoBlockVolume *= geoBlockSize[d]; } - - // Detect if the "coarse" op is the Kahler-Dirac op or something else - // that still acts on a fine staggered ColorSpinorField - if (geoBlockVolume == 1 && Nvec == nColor && nSpin == 1) { - coarseParam.nSpin = nSpin; - coarseParam.nColor = nColor; - } else { - coarseParam.nSpin = (nSpin == 1) ? 2 : (nSpin / spinBlockSize); // coarsening staggered check - coarseParam.nColor = Nvec; - } - - coarseParam.siteSubset = QUDA_FULL_SITE_SUBSET; // coarse grid is always full - coarseParam.create = QUDA_ZERO_FIELD_CREATE; - - // if new precision is not set, use this->precision - new_precision = (new_precision == QUDA_INVALID_PRECISION) ? Precision() : new_precision; - - // if new location is not set, use this->location - new_location = (new_location == QUDA_INVALID_FIELD_LOCATION) ? Location() : new_location; - - coarseParam.fieldOrder = (new_location == QUDA_CUDA_FIELD_LOCATION) ? 
- colorspinor::getNative(new_precision, coarseParam.nSpin) : - QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - - coarseParam.setPrecision(new_precision); - - // set where we allocate the field - coarseParam.mem_type = new_mem_type; - - return new ColorSpinorField(coarseParam); - } - - ColorSpinorField *ColorSpinorField::CreateFine(const int *geoBlockSize, int spinBlockSize, int Nvec, - QudaPrecision new_precision, QudaFieldLocation new_location, - QudaMemoryType new_mem_type) - { - ColorSpinorParam fineParam(*this); - for (int d = 0; d < nDim; d++) fineParam.x[d] = x[d] * geoBlockSize[d]; - fineParam.nSpin = nSpin * spinBlockSize; - fineParam.nColor = Nvec; - fineParam.siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME fine grid is always full - fineParam.create = QUDA_ZERO_FIELD_CREATE; - - // if new precision is not set, use this->precision - new_precision = (new_precision == QUDA_INVALID_PRECISION) ? Precision() : new_precision; - - // if new location is not set, use this->location - new_location = (new_location == QUDA_INVALID_FIELD_LOCATION) ? Location() : new_location; - - // for GPU fields, always use native ordering to ensure coalescing - if (new_location == QUDA_CUDA_FIELD_LOCATION) { - fineParam.setPrecision(new_precision, new_precision, true); - } else { - fineParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - fineParam.setPrecision(new_precision); - } - - // set where we allocate the field - fineParam.mem_type = new_mem_type; - - return new ColorSpinorField(fineParam); - } - ColorSpinorField ColorSpinorField::create_coarse(const int *geoBlockSize, int spinBlockSize, int Nvec, QudaPrecision new_precision, QudaFieldLocation new_location, QudaMemoryType new_mem_type) diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 03247e2356..fac28198d3 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -46,13 +46,13 @@ namespace quda } if (param.B[0].Nspin() == 1) csParam.gammaBasis = param.B[0].GammaBasis(); // hack for staggered to avoid unnecessary basis checks - r = new ColorSpinorField(csParam); + r = ColorSpinorField(csParam); // if we're using preconditioning then allocate storage for the preconditioned source vector if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) { csParam.x[0] /= 2; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; - b_tilde = new ColorSpinorField(csParam); + b_tilde = ColorSpinorField(csParam); } } @@ -65,8 +65,8 @@ namespace quda // Initializing to random vectors for (int i = 0; i < (int)param.B.size(); i++) { - spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); - param.B[i] = *r; + spinorNoise(r, *rng, QUDA_NOISE_UNIFORM); + param.B[i] = r; } } if (param.mg_global.num_setup_iter[param.level] > 0) { @@ -139,19 +139,19 @@ namespace quda param.mg_global.geo_block_size[param.level][i] = param.geoBlockSize[i]; // create coarse temporary vector if not already created in verify() - if (!tmp_coarse) - tmp_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (tmp_coarse.empty()) + tmp_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); // create coarse residual vector if not already created in verify() - if (!r_coarse) - r_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (r_coarse.empty()) + r_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, 
r.Precision(), + param.mg_global.location[param.level + 1]); // create coarse solution vector if not already created in verify() - if (!x_coarse) - x_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (x_coarse.empty()) + x_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); int nVec_coarse = std::max(param.Nvec, param.mg_global.n_vec[param.level + 1]); B_coarse.resize(nVec_coarse); @@ -504,10 +504,10 @@ namespace quda xInvKD_sloppy = std::shared_ptr(reinterpret_cast(new GaugeField(xinv_param))); xInvKD_sloppy->copy(*xInvKD); - ColorSpinorParam sloppy_tmp_param(*tmp_coarse); + ColorSpinorParam sloppy_tmp_param(tmp_coarse); sloppy_tmp_param.setPrecision(param.mg_global.invert_param->cuda_prec_precondition); - tmp_coarse_sloppy = new ColorSpinorField(sloppy_tmp_param); + tmp_coarse_sloppy = ColorSpinorField(sloppy_tmp_param); } else { // We can just alias fields @@ -677,7 +677,7 @@ namespace quda param_coarse_solver->verbosity_precondition = param.mg_global.verbosity[param.level+1]; // preconditioned solver wrapper is uniform precision - param_coarse_solver->precision = r_coarse->Precision(); + param_coarse_solver->precision = r_coarse.Precision(); param_coarse_solver->precision_sloppy = param_coarse_solver->precision; param_coarse_solver->precision_precondition = param_coarse_solver->precision_sloppy; @@ -716,9 +716,9 @@ namespace quda // Run a dummy solve so that the deflation space is constructed and computed if needed during the MG setup, // or the eigenvalues are recomputed during transfer. - spinorNoise(*r_coarse, *coarse->rng, QUDA_NOISE_UNIFORM); + spinorNoise(r_coarse, *coarse->rng, QUDA_NOISE_UNIFORM); param_coarse_solver->maxiter = 1; // do a single iteration on the dummy solve - (*coarse_solver)(*x_coarse, *r_coarse); + (*coarse_solver)(x_coarse, r_coarse); setOutputPrefix(prefix); // restore since we just popped back from coarse grid param_coarse_solver->maxiter = param.mg_global.coarse_solver_maxiter[param.level + 1]; } @@ -760,14 +760,6 @@ namespace quda if (presmoother) delete presmoother; if (param_presmooth) delete param_presmooth; - - if (b_tilde && param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) delete b_tilde; - if (r) delete r; - if (r_coarse) delete r_coarse; - if (x_coarse) delete x_coarse; - if (tmp_coarse) delete tmp_coarse; - if (tmp_coarse_sloppy) delete tmp_coarse_sloppy; - if (param_coarse) delete param_coarse; popLevel(); @@ -786,7 +778,7 @@ namespace quda pushLevel(param.level); // temporary fields used for verification - ColorSpinorParam csParam(*r); + ColorSpinorParam csParam(r); csParam.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField tmp1(csParam); ColorSpinorField tmp2(csParam); @@ -815,14 +807,14 @@ namespace quda // as well as copying to the correct location this also changes basis if necessary tmp1 = param.B[i]; - transfer->R(*r_coarse, tmp1); - transfer->P(tmp2, *r_coarse); + transfer->R(r_coarse, tmp1); + transfer->P(tmp2, r_coarse); auto max_deviation = blas::max_deviation(tmp2, tmp1); auto l2_deviation = sqrt(xmyNorm(tmp1, tmp2) / norm2(tmp1)); logQuda( QUDA_VERBOSE, "Vector %d: L2 norms v_k = %e P^\\dagger v_k = %e (1 - P P^\\dagger) v_k = %e; Deviations: L2 relative = %e, max = %e\n", - i, norm2(tmp1), norm2(*r_coarse), norm2(tmp2), l2_deviation, max_deviation[0]); + i, norm2(tmp1), norm2(r_coarse), norm2(tmp2), l2_deviation, max_deviation[0]); if 
(check_deviation(l2_deviation, tol)) errorQuda("k=%d orthonormality failed: L2 relative deviation %e > %e", i, l2_deviation, tol); if (check_deviation(max_deviation[0], tol)) @@ -839,10 +831,10 @@ namespace quda logQuda(QUDA_SUMMARIZE, "Checking 1 > || (1 - DP(P^dagDP)P^dag) v_k || / || v_k || for %d vectors\n", param.Nvec); for (int i = 0; i < param.Nvec; i++) { - transfer->R(*r_coarse, param.B[i]); - (*coarse_solver)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass + transfer->R(r_coarse, param.B[i]); + (*coarse_solver)(x_coarse, r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore prefix after return from coarse grid - transfer->P(tmp2, *x_coarse); + transfer->P(tmp2, x_coarse); (*param.matResidual)(tmp1, tmp2); tmp2 = param.B[i]; logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e\n", i, norm2(param.B[i]), norm2(tmp1)); @@ -859,10 +851,10 @@ namespace quda param.Nvec); for (int i=0; iR(*r_coarse, param.B[i]); - (*coarse)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass + transfer->R(r_coarse, param.B[i]); + (*coarse)(x_coarse, r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore output prefix - transfer->P(tmp2, *x_coarse); + transfer->P(tmp2, x_coarse); param.matResidual(tmp1, tmp2); tmp2 = param.B[i]; logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e ", i, norm2(param.B[i]), norm2(tmp1)); @@ -873,30 +865,30 @@ namespace quda // We need to create temporary coarse vectors here for various verifications // Otherwise these get created in the coarse `reset()` routine later - if (!tmp_coarse) - tmp_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (tmp_coarse.empty()) + tmp_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); // create coarse residual vector if not already created in verify() - if (!r_coarse) - r_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (r_coarse.empty()) + r_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); // create coarse solution vector if not already created in verify() - if (!x_coarse) - x_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (x_coarse.empty()) + x_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); { logQuda(QUDA_SUMMARIZE, "Checking 0 = (1 - P^\\dagger P) eta_c\n"); - spinorNoise(*x_coarse, *rng, QUDA_NOISE_UNIFORM); - transfer->P(tmp2, *x_coarse); - transfer->R(*r_coarse, tmp2); - auto r2 = norm2(*r_coarse); - auto max_deviation = blas::max_deviation(*r_coarse, *x_coarse); - auto l2_deviation = sqrt(xmyNorm(*x_coarse, *r_coarse) / norm2(*x_coarse)); - logQuda(QUDA_VERBOSE, "L2 norms %e %e (fine tmp %e); Deviations: L2 relative = %e, max = %e\n", norm2(*x_coarse), + spinorNoise(x_coarse, *rng, QUDA_NOISE_UNIFORM); + transfer->P(tmp2, x_coarse); + transfer->R(r_coarse, tmp2); + auto r2 = norm2(r_coarse); + auto max_deviation = blas::max_deviation(r_coarse, x_coarse); + auto l2_deviation = sqrt(xmyNorm(x_coarse, r_coarse) / norm2(x_coarse)); + 
logQuda(QUDA_VERBOSE, "L2 norms %e %e (fine tmp %e); Deviations: L2 relative = %e, max = %e\n", norm2(x_coarse), r2, norm2(tmp2), l2_deviation, max_deviation[0]); if (check_deviation(l2_deviation, tol)) errorQuda("coarse span failed: L2 relative deviation = %e > %e", l2_deviation, tol); @@ -905,17 +897,17 @@ namespace quda } logQuda(QUDA_SUMMARIZE, "Checking 0 = (D_c - P^\\dagger D P) (native coarse operator to emulated operator)\n"); - zero(*tmp_coarse); - zero(*r_coarse); + zero(tmp_coarse); + zero(r_coarse); #if 0 // debugging trick: point source matrix elements - tmp_coarse->Source(QUDA_POINT_SOURCE, 0, 0, 0); + tmp_coarse.Source(QUDA_POINT_SOURCE, 0, 0, 0); #else - spinorNoise(*tmp_coarse, *rng, QUDA_NOISE_UNIFORM); + spinorNoise(tmp_coarse, *rng, QUDA_NOISE_UNIFORM); #endif // put a non-trivial vector on the fine level as well - transfer->P(tmp1, *tmp_coarse); + transfer->P(tmp1, tmp_coarse); // the three-hop terms in ASQTAD can break the verification depending on how we're coarsening the operator // and if the aggregate size is too small in a direction @@ -975,8 +967,8 @@ namespace quda (*param.matResidual)(tmp2, tmp1); } - transfer->R(*x_coarse, tmp2); - static_cast(diracCoarseResidual)->M(*r_coarse, *tmp_coarse); + transfer->R(x_coarse, tmp2); + static_cast(diracCoarseResidual)->M(r_coarse, tmp_coarse); #if 0 // enable to print out emulated and actual coarse-grid operator vectors for debugging setOutputPrefix(""); @@ -986,20 +978,20 @@ namespace quda printfQuda("\nemulated\n"); comm_barrier(); for (int parity = 0; parity < 2; parity++) - for (unsigned int x_cb = 0; x_cb < x_coarse->VolumeCB(); x_cb++) x_coarse->PrintVector(parity, x_cb, rank); + for (unsigned int x_cb = 0; x_cb < x_coarse.VolumeCB(); x_cb++) x_coarse.PrintVector(parity, x_cb, rank); comm_barrier(); printfQuda("\nactual\n"); comm_barrier(); for (int parity = 0; parity < 2; parity++) - for (unsigned int x_cb = 0; x_cb < r_coarse->VolumeCB(); x_cb++) r_coarse->PrintVector(parity, x_cb, rank); + for (unsigned int x_cb = 0; x_cb < r_coarse.VolumeCB(); x_cb++) r_coarse.PrintVector(parity, x_cb, rank); } setOutputPrefix(prefix); #endif - double r_nrm = norm2(*r_coarse); - auto max_deviation = blas::max_deviation(*r_coarse, *x_coarse); - auto l2_deviation = sqrt(xmyNorm(*x_coarse, *r_coarse) / norm2(*x_coarse)); + double r_nrm = norm2(r_coarse); + auto max_deviation = blas::max_deviation(r_coarse, x_coarse); + auto l2_deviation = sqrt(xmyNorm(x_coarse, r_coarse) / norm2(x_coarse)); if (diracResidual->Mu() != 0.0) { // When the mu is shifted on the coarse level; we can compute exactly the error we introduce in the check: @@ -1008,13 +1000,13 @@ namespace quda if (fabs(delta_factor) > tol) { double delta_a = delta_factor * 2.0 * diracResidual->Kappa() * diracResidual->Mu() * transfer->Vectors().TwistFlavor(); - l2_deviation -= fabs(delta_a) * sqrt(norm2(*tmp_coarse) / norm2(*x_coarse)); + l2_deviation -= fabs(delta_a) * sqrt(norm2(tmp_coarse) / norm2(x_coarse)); l2_deviation = fabs(l2_deviation); max_deviation[0] -= fabs(delta_a); } } logQuda(QUDA_VERBOSE, "L2 norms: Emulated = %e, Native = %e; Deviations: L2 relative = %e, max = %e\n", - norm2(*x_coarse), r_nrm, l2_deviation, max_deviation[0]); + norm2(x_coarse), r_nrm, l2_deviation, max_deviation[0]); if (check_deviation(l2_deviation, tol)) errorQuda("Coarse operator failed: L2 relative deviation = %e > %e", l2_deviation, tol); @@ -1028,14 +1020,14 @@ namespace quda if (coarse_was_preconditioned) { // check eo logQuda(QUDA_SUMMARIZE, "Checking Deo of preconditioned 
operator 0 = \\hat{D}_c - A^{-1} D_c\n"); - static_cast(diracCoarseResidual)->Dslash(r_coarse->Even(), tmp_coarse->Odd(), QUDA_EVEN_PARITY); - static_cast(diracCoarseResidual)->CloverInv(x_coarse->Even(), r_coarse->Even(), QUDA_EVEN_PARITY); - static_cast(diracCoarseSmoother)->Dslash(r_coarse->Even(), tmp_coarse->Odd(), QUDA_EVEN_PARITY); - double r_nrm = norm2(r_coarse->Even()); - auto max_deviation = blas::max_deviation(r_coarse->Even(), x_coarse->Even()); - auto l2_deviation = sqrt(xmyNorm(x_coarse->Even(), r_coarse->Even()) / norm2(x_coarse->Even())); + static_cast(diracCoarseResidual)->Dslash(r_coarse.Even(), tmp_coarse.Odd(), QUDA_EVEN_PARITY); + static_cast(diracCoarseResidual)->CloverInv(x_coarse.Even(), r_coarse.Even(), QUDA_EVEN_PARITY); + static_cast(diracCoarseSmoother)->Dslash(r_coarse.Even(), tmp_coarse.Odd(), QUDA_EVEN_PARITY); + double r_nrm = norm2(r_coarse.Even()); + auto max_deviation = blas::max_deviation(r_coarse.Even(), x_coarse.Even()); + auto l2_deviation = sqrt(xmyNorm(x_coarse.Even(), r_coarse.Even()) / norm2(x_coarse.Even())); logQuda(QUDA_VERBOSE, "L2 norms: Emulated = %e, Native = %e; Deviations: L2 relative = %e, max = %e\n", - norm2(x_coarse->Even()), r_nrm, l2_deviation, max_deviation[0]); + norm2(x_coarse.Even()), r_nrm, l2_deviation, max_deviation[0]); if (check_deviation(l2_deviation, tol)) errorQuda("Preconditioned Deo failed: L2 relative deviation = %e > %e", l2_deviation, tol); if (check_deviation(max_deviation[0], tol)) @@ -1043,14 +1035,14 @@ namespace quda // check Doe logQuda(QUDA_SUMMARIZE, "Checking Doe of preconditioned operator 0 = \\hat{D}_c - A^{-1} D_c\n"); - static_cast(diracCoarseResidual)->Dslash(r_coarse->Odd(), tmp_coarse->Even(), QUDA_ODD_PARITY); - static_cast(diracCoarseResidual)->CloverInv(x_coarse->Odd(), r_coarse->Odd(), QUDA_ODD_PARITY); - static_cast(diracCoarseSmoother)->Dslash(r_coarse->Odd(), tmp_coarse->Even(), QUDA_ODD_PARITY); - r_nrm = norm2(r_coarse->Odd()); - max_deviation = blas::max_deviation(r_coarse->Odd(), x_coarse->Odd()); - l2_deviation = sqrt(xmyNorm(x_coarse->Odd(), r_coarse->Odd()) / norm2(x_coarse->Odd())); + static_cast(diracCoarseResidual)->Dslash(r_coarse.Odd(), tmp_coarse.Even(), QUDA_ODD_PARITY); + static_cast(diracCoarseResidual)->CloverInv(x_coarse.Odd(), r_coarse.Odd(), QUDA_ODD_PARITY); + static_cast(diracCoarseSmoother)->Dslash(r_coarse.Odd(), tmp_coarse.Even(), QUDA_ODD_PARITY); + r_nrm = norm2(r_coarse.Odd()); + max_deviation = blas::max_deviation(r_coarse.Odd(), x_coarse.Odd()); + l2_deviation = sqrt(xmyNorm(x_coarse.Odd(), r_coarse.Odd()) / norm2(x_coarse.Odd())); logQuda(QUDA_VERBOSE, "L2 norms: Emulated = %e, Native = %e; Deviations: L2 relative = %e, max = %e\n", - norm2(x_coarse->Odd()), r_nrm, l2_deviation, max_deviation[0]); + norm2(x_coarse.Odd()), r_nrm, l2_deviation, max_deviation[0]); if (check_deviation(l2_deviation, tol)) errorQuda("Preconditioned Doe failed: L2 relative deviation = %e > %e", l2_deviation, tol); if (check_deviation(max_deviation[0], tol)) @@ -1107,12 +1099,12 @@ namespace quda for (int i = 0; i < param.Nvec; i++) { // Restrict Evec, place result in r_coarse - transfer->R(*r_coarse, param.B[i]); + transfer->R(r_coarse, param.B[i]); // Prolong r_coarse, place result in tmp2 - transfer->P(tmp2, *r_coarse); + transfer->P(tmp2, r_coarse); printfQuda("Vector %d: norms v_k = %e P^dag v_k = %e PP^dag v_k = %e\n", i, norm2(param.B[i]), - norm2(*r_coarse), norm2(tmp2)); + norm2(r_coarse), norm2(tmp2)); // Compare v_k and PP^dag v_k. 
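// The identity behind this check: block orthonormalization gives P^\dag P = 1,
// so P P^\dag is a projector onto the span of the null-space vectors, and each
// v_k should be reproduced up to precision:
//   r_coarse = P^\dag v_k   (the transfer->R call above)
//   tmp2     = P P^\dag v_k (the transfer->P call above)
//   || v_k - P P^\dag v_k || / || v_k || ~ 0,
// which is what the max_deviation check below measures.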
auto max_deviation = blas::max_deviation(tmp2, param.B[i]); @@ -1128,10 +1120,10 @@ namespace quda // Oblique projections logQuda(QUDA_SUMMARIZE, "Checking 1 > || (1 - DP(P^dagDP)P^dag) v_k || / || v_k || for vector %d\n", i); - transfer->R(*r_coarse, param.B[i]); - (*coarse_solver)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass + transfer->R(r_coarse, param.B[i]); + (*coarse_solver)(x_coarse, r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore prefix after return from coarse grid - transfer->P(tmp2, *x_coarse); + transfer->P(tmp2, x_coarse); (*param.matResidual)(tmp1, tmp2); logQuda(QUDA_SUMMARIZE, "Vector %d: norms v_k %e DP(P^dagDP)P^dag v_k %e\n", i, norm2(param.B[i]), @@ -1194,8 +1186,7 @@ namespace quda diracSmoother->prepare(in, out, x, b, outer_solution_type); // b_tilde holds either a copy of preconditioned source or a pointer to original source - if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) *b_tilde = *in; - else b_tilde = &b; + if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) b_tilde = *in; if (presmoother) (*presmoother)(*out, *in); else zero(*out); @@ -1214,8 +1205,8 @@ namespace quda // FIXME this is currently borked if inner solver is preconditioned ColorSpinorField &residual = !presmoother ? b : use_solver_residual ? presmoother->get_residual() : - b.SiteSubset() == QUDA_FULL_SITE_SUBSET ? *r : - r->Even(); + b.SiteSubset() == QUDA_FULL_SITE_SUBSET ? r : + r.Even(); if (!use_solver_residual && presmoother) { (*param.matResidual)(residual, x); @@ -1228,32 +1219,33 @@ namespace quda if (transfer) { // restrict to the coarse grid - transfer->R(*r_coarse, residual); - if ( debug ) printfQuda("after pre-smoothing x2 = %e, r2 = %e, r_coarse2 = %e\n", norm2(x), r2, norm2(*r_coarse)); + transfer->R(r_coarse, residual); + if (debug) printfQuda("after pre-smoothing x2 = %e, r2 = %e, r_coarse2 = %e\n", norm2(x), r2, norm2(r_coarse)); // recurse to the next lower level - (*coarse_solver)(*x_coarse, *r_coarse); - if (debug) printfQuda("after coarse solve x_coarse2 = %e r_coarse2 = %e\n", norm2(*x_coarse), norm2(*r_coarse)); + (*coarse_solver)(x_coarse, r_coarse); + if (debug) printfQuda("after coarse solve x_coarse2 = %e r_coarse2 = %e\n", norm2(x_coarse), norm2(r_coarse)); // prolongate back to this grid - ColorSpinorField &x_coarse_2_fine = inner_solution_type == QUDA_MAT_SOLUTION ? *r : r->Even(); // define according to inner solution type - transfer->P(x_coarse_2_fine, *x_coarse); // repurpose residual storage + ColorSpinorField &x_coarse_2_fine + = inner_solution_type == QUDA_MAT_SOLUTION ? r : r.Even(); // define according to inner solution type + transfer->P(x_coarse_2_fine, x_coarse); // repurpose residual storage xpy(x_coarse_2_fine, solution); // sum to solution FIXME - sum should be done inside the transfer operator if ( debug ) { - printfQuda("Prolongated coarse solution y2 = %e\n", norm2(*r)); - printfQuda("after coarse-grid correction x2 = %e, r2 = %e\n", norm2(x), norm2(*r)); + printfQuda("Prolongated coarse solution y2 = %e\n", norm2(r)); + printfQuda("after coarse-grid correction x2 = %e, r2 = %e\n", norm2(x), norm2(r)); } } if (debug) printfQuda("preparing to post smooth\n"); // do the post smoothing - //residual = outer_solution_type == QUDA_MAT_SOLUTION ? *r : r->Even(); // refine for outer solution type + // residual = outer_solution_type == QUDA_MAT_SOLUTION ? 
r : r.Even(); // refine for outer solution type if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) { - in = b_tilde; + in = &b_tilde; } else { // this incurs unecessary copying - *r = b; - in = r; + r = b; + in = &r; } // we should keep a copy of the prepared right hand side as we've already destroyed it @@ -1276,9 +1268,9 @@ namespace quda } // FIXME on subset check - if (debug && b.SiteSubset() == r->SiteSubset()) { - (*param.matResidual)(*r, x); - double r2 = xmyNorm(b, *r); + if (debug && b.SiteSubset() == r.SiteSubset()) { + (*param.matResidual)(r, x); + double r2 = xmyNorm(b, r); printfQuda("leaving V-cycle with x2=%e, r2=%e\n", norm2(x), r2); } @@ -1359,7 +1351,7 @@ namespace quda } solverParam.pipeline = (solverParam.inv_type == QUDA_BICGSTAB_INVERTER ? 0 : 4); // FIXME: pipeline != 0 breaks BICGSTAB - solverParam.precision = r->Precision(); + solverParam.precision = r.Precision(); if (is_fine_grid()) { solverParam.precision_sloppy = param.mg_global.invert_param->cuda_prec_precondition; @@ -1371,7 +1363,7 @@ namespace quda solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); solverParam.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; ColorSpinorParam csParam(B[0]); // Create spinor field parameters: - csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering + csParam.setPrecision(r.Precision(), r.Precision(), true); // ensure native ordering csParam.location = QUDA_CUDA_FIELD_LOCATION; // hard code to GPU location for null-space generation for now csParam.gammaBasis = B[0].Nspin() == 1 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS : QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered From af341fe40caddadae5b5f0e7a914bbc35611d92c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 Jan 2024 12:15:10 -0800 Subject: [PATCH 07/21] Fix for block ortho: prior commit accidentally introduced memory copies of the null space fields --- include/kernels/block_orthogonalize.cuh | 9 ++++----- lib/block_orthogonalize.in.cu | 18 ++++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh index bec77273b7..348d56dcd8 100644 --- a/include/kernels/block_orthogonalize.cuh +++ b/include/kernels/block_orthogonalize.cuh @@ -51,15 +51,14 @@ namespace quda { static constexpr bool swizzle = false; int_fastdiv swizzle_factor; // for transposing blockIdx.x mapping to coarse grid coordinate - const Vector B[nVec]; + Vector B[nVec]; static constexpr bool launch_bounds = true; dim3 grid_dim; dim3 block_dim; - template - BlockOrthoArg(ColorSpinorField &V, const int *fine_to_coarse, const int *coarse_to_fine, int parity, - const int *geo_bs, const int n_block_ortho, const ColorSpinorField &meta, T... B) : + BlockOrthoArg(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, int parity, + const int *geo_bs, const int n_block_ortho, const ColorSpinorField &meta) : kernel_param(dim3(meta.VolumeCB() * (fineSpin > 1 ? 
meta.SiteSubset() : 1), 1, chiral_blocks)), V(V), fine_to_coarse(fine_to_coarse), @@ -69,10 +68,10 @@ namespace quda { nParity(meta.SiteSubset()), nBlockOrtho(n_block_ortho), fineVolumeCB(meta.VolumeCB()), - B{B...}, grid_dim(), block_dim() { + for (int i = 0; i < nVec; i++) this->B[i] = B[i]; int aggregate_size = 1; for (int d = 0; d < V.Ndim(); d++) aggregate_size *= geo_bs[d]; aggregate_size_cb = aggregate_size / 2; diff --git a/lib/block_orthogonalize.in.cu b/lib/block_orthogonalize.in.cu index e4e3a9e12c..e67e083b94 100644 --- a/lib/block_orthogonalize.in.cu +++ b/lib/block_orthogonalize.in.cu @@ -112,20 +112,18 @@ namespace quda { } } - template - void launch_host_(const TuneParam &tp, const qudaStream_t &stream, const std::vector &B, - std::index_sequence) + template + void launch_host_(const TuneParam &tp, const qudaStream_t &stream) { - Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...); + Arg arg(V, B, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V); launch_host(tp, stream, arg); if (two_pass && iter == 0 && V.Precision() < QUDA_SINGLE_PRECISION && !activeTuning()) max = Rotator(V).abs_max(V); } - template - void launch_device_(const TuneParam &tp, const qudaStream_t &stream, const std::vector &B, - std::index_sequence) + template + void launch_device_(const TuneParam &tp, const qudaStream_t &stream) { - Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...); + Arg arg(V, B, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V); arg.swizzle_factor = tp.aux.x; launch_device(tp, stream, arg); if (two_pass && iter == 0 && V.Precision() < QUDA_SINGLE_PRECISION && !activeTuning()) max = Rotator(V).abs_max(V); @@ -140,7 +138,7 @@ namespace quda { && B[0].FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) { typedef FieldOrderCB Rotator; typedef FieldOrderCB Vector; - launch_host_(tp, stream, B, std::make_index_sequence()); + launch_host_(tp, stream); } else { errorQuda("Unsupported field order %d", V.FieldOrder()); } @@ -150,7 +148,7 @@ namespace quda { if (V.FieldOrder() == vOrder && B[0].FieldOrder() == bOrder) { typedef FieldOrderCB Rotator; typedef FieldOrderCB::value> Vector; - launch_device_(tp, stream, B, std::make_index_sequence()); + launch_device_(tp, stream); } else { errorQuda("Unsupported field order V=%d B=%d", V.FieldOrder(), B[0].FieldOrder()); } From 6b8ddcf5c2d0d6c1c6a77e9cb48f732e5cf90b5e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 Jan 2024 12:16:02 -0800 Subject: [PATCH 08/21] Add n_rhs to restrictor/prolongator aux string --- lib/prolongator.in.cu | 4 ++++ lib/restrictor.in.cu | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/prolongator.in.cu b/lib/prolongator.in.cu index 8320ca5e8f..d9e16b5f81 100644 --- a/lib/prolongator.in.cu +++ b/lib/prolongator.in.cu @@ -33,6 +33,10 @@ namespace quda { strcat(vol, out[0].VolString().c_str()); strcat(aux, ","); strcat(aux, out[0].AuxString().c_str()); + strcat(aux, ",n_rhs="); + char rhs_str[16]; + i32toa(rhs_str, out.size()); + strcat(aux, rhs_str); apply(device::get_default_stream()); } diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu index 27a1bd5c5d..281d5fb0bf 100644 --- a/lib/restrictor.in.cu +++ b/lib/restrictor.in.cu @@ -38,6 +38,10 @@ namespace quda { strcat(vol, out[0].VolString().c_str()); strcat(aux, ","); strcat(aux, out[0].AuxString().c_str()); + strcat(aux, ",n_rhs="); + char rhs_str[16]; + i32toa(rhs_str, 
out.size()); + strcat(aux, rhs_str); apply(device::get_default_stream()); } @@ -99,7 +103,7 @@ namespace quda { param.aux.x = 2; // swizzle factor } - long long flops() const { return out.size() * 8 * fineSpin * fineColor * coarseColor * in[0].SiteSubset()*in[0].VolumeCB(); } + long long flops() const { return out.size() * 8 * fineSpin * fineColor * coarseColor * in[0].SiteSubset() * in[0].VolumeCB(); } long long bytes() const { From 5351201b12d911f9dcfb31aafb796108ee0fdba7 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 Jan 2024 06:12:17 -0800 Subject: [PATCH 09/21] Prolongator and restrictor can now do gamma basis on the fly for fine-grid fields with nSpin=4 --- include/color_spinor_field_order.h | 2 +- include/kernels/prolongator.cuh | 24 ++++++++++++++++++---- include/kernels/restrictor.cuh | 33 +++++++++++++++--------------- lib/prolongator.in.cpp | 4 ++++ lib/prolongator.in.cu | 14 +++++++++++-- lib/restrictor.in.cpp | 4 ++++ lib/restrictor.in.cu | 24 +++++++++++++++++----- lib/transfer.cpp | 16 +++------------ 8 files changed, 79 insertions(+), 42 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 022d2a2b69..1f156d73d1 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -914,7 +914,7 @@ namespace quda */ template __device__ __host__ inline void load(complex out[nSpinBlock * nColor * nVec], int parity, int x_cb, - int chi) const + int chi = 0) const { if (!fixed) { accessor.template load((complex *)out, v.v, parity, x_cb, chi, volumeCB); diff --git a/include/kernels/prolongator.cuh b/include/kernels/prolongator.cuh index d4296d41b1..515c409e2c 100644 --- a/include/kernels/prolongator.cuh +++ b/include/kernels/prolongator.cuh @@ -1,4 +1,5 @@ #include +#include #include #include @@ -14,13 +15,14 @@ namespace quda { /** Kernel argument struct */ - template + template struct ProlongateArg : kernel_param<> { using real = Float; static constexpr int fineSpin = fineSpin_; static constexpr int coarseSpin = coarseSpin_; static constexpr int fineColor = fineColor_; static constexpr int coarseColor = coarseColor_; + static constexpr bool to_non_rel = to_non_rel_; // disable ghost to reduce arg size using F = FieldOrderCB(fineSpin), Float, Float, true>; @@ -36,7 +38,7 @@ namespace quda { const spin_mapper spin_map; const int parity; // the parity of the output field (if single parity) const int nParity; // number of parities of input fine field - + ProlongateArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *geo_map, const int parity) : kernel_param(dim3(out[0].VolumeCB(), out[0].SiteSubset() * out.size(), fineColor/fine_colors_per_thread())), @@ -88,6 +90,8 @@ namespace quda { constexpr int color_unroll = 2; + ColorSpinor out; + #pragma unroll for (int s=0; s(2.0)); + } + +#pragma unroll + for (int s = 0; s < Arg::fineSpin; s++) { +#pragma unroll + for (int fine_color_local = 0; fine_color_local < fine_color_per_thread; fine_color_local++) { + int i = fine_color_block + fine_color_local; // global fine color index + arg.out[src_idx](spinor_parity, x_cb, s, i) = out(s, fine_color_local); + } + } } template struct Prolongator diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh index 9da7633071..ed2e70d580 100644 --- a/include/kernels/restrictor.cuh +++ b/include/kernels/restrictor.cuh @@ -21,13 +21,14 @@ namespace quda { /** Kernel argument struct */ - template + template struct RestrictArg : kernel_param<> { using real = Float; static 
constexpr int fineSpin = fineSpin_; static constexpr int fineColor = fineColor_; static constexpr int coarseSpin = coarseSpin_; static constexpr int coarseColor = coarseColor_; + static constexpr bool from_non_rel = from_non_rel_; // disable ghost to reduce arg size using F = FieldOrderCB(fineSpin), Float, Float, true>; @@ -52,6 +53,7 @@ namespace quda { static constexpr bool swizzle = true; int_fastdiv swizzle_factor; // for transposing blockIdx.x mapping to coarse grid coordinate + // when setting n_vector_z we can only use the compile-time part so can't include n_src in this min (not really an issue) static constexpr int n_vector_z = std::min(coarseColor/coarse_colors_per_thread(), max_z_block()); static_assert(n_vector_z > 0, "n_vector_z cannot be less than 1"); @@ -92,33 +94,30 @@ namespace quda { const int v_parity = (arg.v.Nparity() == 2) ? parity : 0; #pragma unroll - for (int s=0; s partial[color_unroll]; -#pragma unroll - for (int k=0; k in; + arg.in[src_idx].load(in.data, spinor_parity, x_cb); -#pragma unroll - for (int j=0; j(2.0)); + } #pragma unroll - for (int k=0; k fineColors; // clang-format on diff --git a/lib/prolongator.in.cu b/lib/prolongator.in.cu index d9e16b5f81..5fcdbd3828 100644 --- a/lib/prolongator.in.cu +++ b/lib/prolongator.in.cu @@ -7,7 +7,8 @@ namespace quda { template class ProlongateLaunch : public TunableKernel3D { - using Arg = ProlongateArg; + template + using Arg = ProlongateArg; cvector_ref &out; cvector_ref ∈ @@ -37,6 +38,7 @@ namespace quda { char rhs_str[16]; i32toa(rhs_str, out.size()); strcat(aux, rhs_str); + if (out[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",to_non_rel"); apply(device::get_default_stream()); } @@ -45,7 +47,15 @@ namespace quda { { if (checkNative(out[0], in[0], V)) { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity)); + if constexpr (fineSpin == 4) { + if (out[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { + launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity)); + } else { + launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity)); + } + } else { + launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity)); + } } } diff --git a/lib/restrictor.in.cpp b/lib/restrictor.in.cpp index 2f6d2697e0..ca40557846 100644 --- a/lib/restrictor.in.cpp +++ b/lib/restrictor.in.cpp @@ -47,6 +47,10 @@ namespace quda const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity) { if constexpr (is_enabled_multigrid()) { + if (v.Nspin() != 1 && out[0].GammaBasis() != v.GammaBasis()) + errorQuda("Cannot apply restrictor using fields in a different basis from the null space (%d,%d) != %d", + out[0].GammaBasis(), in[0].GammaBasis(), v.GammaBasis()); + // clang-format off IntList<@QUDA_MULTIGRID_NC_NVEC_LIST@> fineColors; // clang-format on diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu index 281d5fb0bf..c56482fb49 100644 --- a/lib/restrictor.in.cu +++ b/lib/restrictor.in.cu @@ -15,7 +15,8 @@ namespace quda { template class RestrictLaunch : public TunableBlock2D { - using Arg = RestrictArg; + template + using Arg = RestrictArg; cvector_ref &out; cvector_ref ∈ const ColorSpinorField &v; @@ -42,6 +43,7 @@ namespace quda { char rhs_str[16]; i32toa(rhs_str, out.size()); strcat(aux, rhs_str); + if (in[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",from_non_rel"); apply(device::get_default_stream()); } @@ -50,15 +52,27 @@ namespace quda { { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); if 
(checkNative(out[0], in[0], v)) { - Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity); - arg.swizzle_factor = tp.aux.x; - launch(tp, stream, arg); + if constexpr (fineSpin == 4) { + if (in[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { + Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity); + arg.swizzle_factor = tp.aux.x; + launch(tp, stream, arg); + } else { + Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity); + arg.swizzle_factor = tp.aux.x; + launch(tp, stream, arg); + } + } else { + Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity); + arg.swizzle_factor = tp.aux.x; + launch(tp, stream, arg); + } } } bool advanceAux(TuneParam ¶m) const { - if (Arg::swizzle) { + if (Arg::swizzle) { if (param.aux.x < 2 * (int)device::processor_count()) { param.aux.x++; return true; diff --git a/lib/transfer.cpp b/lib/transfer.cpp index d67c4c8228..cb74a57304 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -377,7 +377,7 @@ namespace quda { } // set output fields - if (out[0].Location() == QUDA_CPU_FIELD_LOCATION || out[0].GammaBasis() != V->GammaBasis()) { + if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) { for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); @@ -402,11 +402,6 @@ namespace quda { if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) errorQuda("Cannot prolongate to a full field since only have single parity null-space components"); - if ((V->Nspin() != 1) && ((output[0].GammaBasis() != V->GammaBasis()) || (input[0].GammaBasis() != V->GammaBasis()))) { - errorQuda("Cannot apply prolongator using fields in a different basis from the null space (%d,%d) != %d", - output[0].GammaBasis(), in[0].GammaBasis(), V->GammaBasis()); - } - Prolongate(output, input, *V, fine_to_coarse, spin_map, parity); for (auto i = 0u; i < out.size(); i++) out[i] = output[i]; // copy result to out field (aliasing handled automatically) @@ -422,7 +417,7 @@ namespace quda { { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, 1); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, in.size()); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; const int *coarse_to_fine = use_gpu ? coarse_to_fine_d : coarse_to_fine_h; @@ -450,12 +445,11 @@ namespace quda { if (use_gpu) { // set input fields - if (in[0].Location() == QUDA_CPU_FIELD_LOCATION || in[0].GammaBasis() != V->GammaBasis()) { + if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? 
fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) input[i] = const_cast(in[i]).create_alias(); } - if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) { @@ -484,10 +478,6 @@ namespace quda { if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && in[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) errorQuda("Cannot restrict a full field since only have single parity null-space components"); - if (V->Nspin() != 1 && (output[0].GammaBasis() != V->GammaBasis() || input[0].GammaBasis() != V->GammaBasis())) - errorQuda("Cannot apply restrictor using fields in a different basis from the null space (%d,%d) != %d", - out[0].GammaBasis(), input[0].GammaBasis(), V->GammaBasis()); - Restrict(output, input, *V, fine_to_coarse, coarse_to_fine, spin_map, parity); for (auto i = 0u; i < out.size(); i++) out[i] = output[i]; // copy result to out field (aliasing handled automatically) From e58f391238fd2099e9a4d3df6f16f9972a27a76d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 Jan 2024 06:26:53 -0800 Subject: [PATCH 10/21] Fix bug with restrictor where too large block size could be chosen (leading to inactive threads) --- lib/restrictor.in.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu index c56482fb49..ee067bac3e 100644 --- a/lib/restrictor.in.cu +++ b/lib/restrictor.in.cu @@ -95,7 +95,7 @@ namespace quda { { auto aggregate_size = in[0].Volume() / out[0].Volume(); auto max_block = 128u; - for (uint32_t b = blockMin(); b < max_block; b += blockStep()) if (aggregate_size < b) return b; + for (uint32_t b = blockMin(); b < max_block; b += blockStep()) if (aggregate_size <= b) return b; return max_block; } From acbd898a327c7ab5317f82aab22eec0dd73ea1ad Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 Jan 2024 07:04:20 -0800 Subject: [PATCH 11/21] Some tweaks to improve restrictor m-rhs perf --- include/kernels/restrictor.cuh | 2 +- lib/restrictor.in.cu | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh index ed2e70d580..aee1601915 100644 --- a/include/kernels/restrictor.cuh +++ b/include/kernels/restrictor.cuh @@ -16,7 +16,7 @@ namespace quda { //coarseColor >= 8 && coarseColor % 8 == 0 ? 8 : coarseColor >= 4 && coarseColor % 4 == 0 ? 4 : 2; } - constexpr int max_z_block() { return 8; } + constexpr int max_z_block() { return 12; } /** Kernel argument struct diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu index ee067bac3e..184cfca15f 100644 --- a/lib/restrictor.in.cu +++ b/lib/restrictor.in.cu @@ -94,7 +94,8 @@ namespace quda { unsigned int blockMapper() const { auto aggregate_size = in[0].Volume() / out[0].Volume(); - auto max_block = 128u; + // for multi-RHS use minimum block size to allow more srcs per block + auto max_block = in.size() > 1 ? 
blockMin() : 64u; for (uint32_t b = blockMin(); b < max_block; b += blockStep()) if (aggregate_size <= b) return b; return max_block; } From f06b154f3450984c9522118ac29dcdb740fe7d23 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 Jan 2024 10:04:58 -0800 Subject: [PATCH 12/21] Remove device temporaries from Transfer class - these are only needed if switching locations, and are now created as needed --- include/field_cache.h | 2 +- include/transfer.h | 27 ++++++++--------- lib/transfer.cpp | 68 +++++++++++++++++++++---------------------- 3 files changed, 48 insertions(+), 49 deletions(-) diff --git a/include/field_cache.h b/include/field_cache.h index 8d4a051ac7..5e05f9acf6 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -123,7 +123,7 @@ namespace quda { @param[in] a Vector of fields we wish to create a matching temporary for */ - template auto getFieldTmp(const vector_ref &a) + template auto getFieldTmp(cvector_ref &a) { std::vector> tmp; tmp.reserve(a.size()); diff --git a/include/transfer.h b/include/transfer.h index 22c4ae1a88..414e90decd 100644 --- a/include/transfer.h +++ b/include/transfer.h @@ -53,16 +53,10 @@ namespace quda { mutable ColorSpinorField *V_d = nullptr; /** A CPU temporary field with fine geometry and fine color we use for changing gamma basis */ - mutable std::vector fine_tmp_h; - - /** A GPU temporary field with fine geometry and fine color we use for changing gamma basis */ - mutable std::vector fine_tmp_d; + mutable ColorSpinorField fine_tmp_h; /** A CPU temporary field with coarse geometry and coarse color */ - mutable std::vector coarse_tmp_h; - - /** A GPU temporary field with coarse geometry and coarse color we use for CPU input / output */ - mutable std::vector coarse_tmp_d; + mutable ColorSpinorField coarse_tmp_h; /** The geometrical coarse grid blocking */ int *geo_bs = nullptr; @@ -125,11 +119,19 @@ namespace quda { void createV(QudaFieldLocation location) const; /** - * @brief Allocate temporaries used when applying transfer operators + * @brief Allocate host temporaries used when applying transfer operators * @param[in] location Where to allocate the temporaries - * @param[in] n_src Number of temporaries to allocate */ - void createTmp(QudaFieldLocation location, size_t n_src) const; + void createTmp() const; + + /** + * @brief Allocate temporaries needed for prolongator / restrictor + * application (if changing location) + * @param[in,out] tmp Storage for temporaries + * @param[in] new_location Location for new temporaries + * @param[in] a Field whose metadata we are cloning (aside from location and ordering) + */ + void createTmp(std::vector &tmp, QudaFieldLocation new_location, ColorSpinorField &a) const; /** * @brief Creates the map between fine and coarse grids @@ -146,9 +148,8 @@ namespace quda { /** * @brief Lazy allocation of the transfer operator in a given location * @param[in] location Where to allocate the temporaries - * @param[in] n_src Number of temporaries to allocate */ - void initializeLazy(QudaFieldLocation location, size_t n_src) const; + void initializeLazy(QudaFieldLocation location) const; public: /** diff --git a/lib/transfer.cpp b/lib/transfer.cpp index cb74a57304..fc48695190 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -96,7 +96,7 @@ namespace quda { } createV(B[0].Location()); // allocate V field - createTmp(QUDA_CPU_FIELD_LOCATION, 1); // allocate temporaries (needed for geomap creation) + createTmp(); // allocate temporaries (needed for geomap creation) // allocate and compute the
fine-to-coarse and coarse-to-fine site maps fine_to_coarse_h = static_cast(pool_pinned_malloc(B[0].Volume() * sizeof(int))); @@ -160,45 +160,33 @@ namespace quda { postTrace(); } - void Transfer::createTmp(QudaFieldLocation location, size_t n_src) const + void Transfer::createTmp() const { // The CPU temporaries are needed for creating geometry mappings. if ((transfer_type == QUDA_TRANSFER_COARSE_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD - || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) - && location != QUDA_CPU_FIELD_LOCATION) { + || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)) return; - } - if (location == QUDA_CUDA_FIELD_LOCATION && fine_tmp_d.size() == n_src && coarse_tmp_d.size() == n_src) return; - if (location == QUDA_CPU_FIELD_LOCATION && fine_tmp_h.size() == n_src && coarse_tmp_h.size() == n_src) return; + if (!fine_tmp_h.empty()) return; postTrace(); ColorSpinorParam param(B[0]); param.create = QUDA_NULL_FIELD_CREATE; - param.location = location; - param.fieldOrder = location == QUDA_CUDA_FIELD_LOCATION ? colorspinor::getNative(null_precision, param.nSpin) : - QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + param.location = QUDA_CPU_FIELD_LOCATION; + param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; if (param.Precision() < QUDA_SINGLE_PRECISION) param.setPrecision(QUDA_SINGLE_PRECISION); - if (location == QUDA_CUDA_FIELD_LOCATION) { - resize(fine_tmp_d, n_src, param); - coarse_tmp_d.push_back(fine_tmp_d[0].create_coarse(geo_bs, spin_bs, Nvec)); - resize(coarse_tmp_d, n_src, ColorSpinorParam(coarse_tmp_d[0])); - } else { - resize(fine_tmp_h, n_src, param); - coarse_tmp_h.push_back(fine_tmp_h[0].create_coarse(geo_bs, spin_bs, Nvec)); - resize(coarse_tmp_h, n_src, ColorSpinorParam(coarse_tmp_h[0])); - } + fine_tmp_h = ColorSpinorParam(param); + coarse_tmp_h = fine_tmp_h.create_coarse(geo_bs, spin_bs, Nvec); postTrace(); } - void Transfer::initializeLazy(QudaFieldLocation location, size_t n_src) const + void Transfer::initializeLazy(QudaFieldLocation location) const { if (!enable_cpu && !enable_gpu) errorQuda("Neither CPU or GPU coarse fields initialized"); - // delayed allocating this temporary until we need it - // if (B[0].Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION, n_src); - createTmp(location, n_src); + createTmp(); switch (location) { case QUDA_CUDA_FIELD_LOCATION: @@ -289,8 +277,8 @@ namespace quda { int x[QUDA_MAX_DIM]; - ColorSpinorField &fine(fine_tmp_h[0]); - ColorSpinorField &coarse(coarse_tmp_h[0]); + ColorSpinorField &fine(fine_tmp_h); + ColorSpinorField &coarse(coarse_tmp_h); // compute the coarse grid point for every site (assuming parity ordering currently) for (size_t i = 0; i < fine.Volume(); i++) { @@ -340,11 +328,22 @@ } } + void Transfer::createTmp(std::vector &tmp, QudaFieldLocation new_location, ColorSpinorField &a) const + { + ColorSpinorParam param(a); + param.location = new_location; + param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; // set to CPU order and override below if needed + param.setPrecision(param.Precision(), param.Precision(), new_location == QUDA_CUDA_FIELD_LOCATION ? true : false); + // ideally we'd want to be able to have tmp[0] on the temp stack as well + tmp[0] = ColorSpinorParam(param); + for (auto i = 1u; i < tmp.size(); i++) tmp[i] = getFieldTmp(tmp[0]); + } + // apply the prolongator void Transfer::P(cvector_ref &out, cvector_ref &in) const { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - initializeLazy(use_gpu ?
QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, in.size()); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; if (transfer_type == QUDA_TRANSFER_COARSE_KD) { @@ -371,18 +370,19 @@ namespace quda { // set input fields if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < in.size(); i++) input[i] = coarse_tmp_d[i].create_alias(); + createTmp(input, QUDA_CUDA_FIELD_LOCATION, coarse_tmp_h); } else { for (auto i = 0u; i < in.size(); i++) input[i] = const_cast(in[i]).create_alias(); } // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); + createTmp(output, QUDA_CUDA_FIELD_LOCATION, fine_tmp_h); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); + } else { // set input fields @@ -390,7 +390,7 @@ // set output fields if (out[0].Location() == QUDA_CUDA_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); + createTmp(output, QUDA_CPU_FIELD_LOCATION, fine_tmp_h); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -417,7 +417,7 @@ { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, in.size()); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; const int *coarse_to_fine = use_gpu ? coarse_to_fine_d : coarse_to_fine_h; @@ -446,14 +446,14 @@ // set input fields if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); + createTmp(input, QUDA_CUDA_FIELD_LOCATION, fine_tmp_h); } else { for (auto i = 0u; i < out.size(); i++) input[i] = const_cast(in[i]).create_alias(); } // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = coarse_tmp_d[i].create_alias(); + createTmp(output, QUDA_CUDA_FIELD_LOCATION, coarse_tmp_h); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -462,15 +462,13 @@ // set input fields if (in[0].Location() == QUDA_CUDA_FIELD_LOCATION) { - for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); + createTmp(input, QUDA_CPU_FIELD_LOCATION, fine_tmp_h); } else { for (auto i = 0u; i < in.size(); i++) input[i] = const_cast(in[i]).create_alias(); } // set output fields - // set input fields for (auto i = 0u; i < out.size(); i++) output[i] = const_cast(out[i]).create_alias(); - } for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically) FIXME - maybe not?
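A note on the pattern PATCH 12 relies on: rather than holding device temporaries as class members, Transfer now clones the metadata of a reference field on demand, overrides only the location (and the ordering implied by it), and recycles storage through the keyed pool behind getFieldTmp. What follows is a minimal, self-contained C++ sketch of that acquire/release shape only; the types FieldDesc, Field and TempField and the helper make_tmp are simplified stand-ins invented for illustration and are not QUDA API.

#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <vector>

// Simplified stand-in for the field metadata (the role ColorSpinorParam plays above).
struct FieldDesc {
  std::string location; // "cpu" or "cuda"
  std::string order;    // field ordering
  std::size_t volume = 0;
  bool operator<(const FieldDesc &o) const
  {
    return std::tie(location, order, volume) < std::tie(o.location, o.order, o.volume);
  }
};

// Simplified stand-in for a field: metadata plus storage.
struct Field {
  FieldDesc desc;
  std::vector<float> data;
  explicit Field(const FieldDesc &d) : desc(d), data(d.volume) { }
};

// RAII temporary drawn from a pool keyed on the metadata: reuse an idle
// field with matching metadata if one exists, otherwise allocate, and
// return the field to the pool on destruction.
class TempField {
  static inline std::multimap<FieldDesc, std::unique_ptr<Field>> pool;
  std::unique_ptr<Field> field;

public:
  explicit TempField(const FieldDesc &desc)
  {
    if (auto it = pool.find(desc); it != pool.end()) {
      field = std::move(it->second); // recycle an idle temporary
      pool.erase(it);
    } else {
      field = std::make_unique<Field>(desc); // none cached, so allocate
    }
  }
  TempField(TempField &&) = default; // a moved-from object holds no field
  ~TempField()
  {
    if (field) {
      auto key = field->desc;
      pool.emplace(std::move(key), std::move(field)); // return to the pool
    }
  }
  Field &get() { return *field; }
};

// Mirror of the createTmp shape above: clone the metadata of a reference
// field, override only location and ordering, then fill a vector of
// pooled temporaries.
std::vector<TempField> make_tmp(std::size_t n, const Field &ref, const std::string &new_location)
{
  FieldDesc desc = ref.desc; // clone metadata from the reference field
  desc.location = new_location;
  desc.order = new_location == "cuda" ? "native" : "space_spin_color";
  std::vector<TempField> tmp;
  tmp.reserve(n);
  for (std::size_t i = 0; i < n; i++) tmp.emplace_back(desc);
  return tmp;
}

In the patches themselves the key is the real field metadata, and tmp[0] in createTmp is a direct allocation with only the remaining entries drawn from getFieldTmp; the sketch keeps just the reuse-or-allocate and return-on-destruction behaviour. PATCH 13 below extends the same cache so that a recycled temporary also has its gamma basis matched to the requesting field.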
From 0c38bef0fef01e9a18975538358db1a33c50547d Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 11:56:12 -0800
Subject: [PATCH 13/21] getFieldTmp will now change basis on the fly to match the desired kind

---
 include/color_spinor_field.h | 1 +
 lib/field_cache.cpp          | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index d91531b900..453a5e8df4 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -695,6 +695,7 @@ namespace quda
     QudaSiteOrder SiteOrder() const { return siteOrder; }
     QudaFieldOrder FieldOrder() const { return fieldOrder; }
     QudaGammaBasis GammaBasis() const { return gammaBasis; }
+    void GammaBasis(QudaGammaBasis new_basis) { gammaBasis = new_basis; }
     const int *GhostFace() const { return ghostFace.data; }
     const int *GhostFaceCB() const { return ghostFaceCB.data; }

diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp
index 9089a96f0c..5ea09e3059 100644
--- a/lib/field_cache.cpp
+++ b/lib/field_cache.cpp
@@ -17,6 +17,10 @@ namespace quda {
       param.create = QUDA_ZERO_FIELD_CREATE;
       tmp = T(param);
     }
+
+    if constexpr (std::is_same_v<T, ColorSpinorField>) {
+      tmp.GammaBasis(a.GammaBasis()); // ensure gamma basis matches
+    }
   }

   template <typename T> FieldTmp<T>::FieldTmp(const FieldKey<T> &key, const typename T::param_type &param) : key(key)

From 540d5f38787972d401a737be6b4163765e4669ba Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 12:01:31 -0800
Subject: [PATCH 14/21] Enable use of multi-RHS prolongator and restrictor for initial orthonormality check in MG::verify() (and some temporary cleanup)

---
 include/multigrid.h |  9 -----
 lib/multigrid.cpp   | 89 ++++++++++++++++++++-------------------
 2 files changed, 39 insertions(+), 59 deletions(-)

diff --git a/include/multigrid.h b/include/multigrid.h
index a369789dfc..35253bd0a6 100644
--- a/include/multigrid.h
+++ b/include/multigrid.h
@@ -297,21 +297,12 @@ namespace quda {
     /** Residual vector */
     ColorSpinorField r;

-    /** Projected source vector for preconditioned system, else just points to source */
-    ColorSpinorField b_tilde;

     /** Coarse residual vector */
     ColorSpinorField r_coarse;

     /** Coarse solution vector */
     ColorSpinorField x_coarse;

-    /** Coarse temporary vector */
-    ColorSpinorField tmp_coarse;
-
-    /** Sloppy coarse temporary vector */
-    ColorSpinorField tmp_coarse_sloppy;
-
     /** Kahler-Dirac Xinv */
     std::shared_ptr<GaugeField> xInvKD;

diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp
index fac28198d3..63cd635c34 100644
--- a/lib/multigrid.cpp
+++ b/lib/multigrid.cpp
@@ -47,13 +47,6 @@ namespace quda
       if (param.B[0].Nspin() == 1) csParam.gammaBasis = param.B[0].GammaBasis(); // hack for staggered to avoid unnecessary basis checks
       r = ColorSpinorField(csParam);
-
-      // if we're using preconditioning then allocate storage for the preconditioned source vector
-      if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) {
-        csParam.x[0] /= 2;
-        csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-        b_tilde = ColorSpinorField(csParam);
-      }
     }

     rng = new RNG(param.B[0], 1234);

@@ -138,11 +131,6 @@ namespace quda
     for (int i = 0; i < QUDA_MAX_MG_LEVEL; i++) param.mg_global.geo_block_size[param.level][i] = param.geoBlockSize[i];

-    // create coarse temporary vector if not already created in verify()
-    if (tmp_coarse.empty())
-      tmp_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(),
-                                            param.mg_global.location[param.level + 1]);
-
     // create coarse residual vector if not already created in verify()
     if (r_coarse.empty())
      r_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(),
                                          param.mg_global.location[param.level + 1]);
@@ -184,9 +172,6 @@ namespace quda
     createSmoother();

     if (param.level < param.Nlevel-1) {
-      // If enabled, verify the coarse links and fine solvers were correctly built
-      if (param.mg_global.run_verify) verify();
-
       // creating or resetting the coarse level temporaries and solvers
       if (coarse) {
         coarse->param.updateInvertParam(*param.mg_global.invert_param);
@@ -209,6 +194,9 @@ namespace quda
       setOutputPrefix(prefix); // restore since we just popped back from coarse grid

       createCoarseSolver();
+
+      // If enabled, verify the coarse links and fine solvers were correctly built
+      if (param.mg_global.run_verify) verify();
     }

     // We're going back up the coarse construct stack now, prefetch the gauge fields on
@@ -504,11 +492,6 @@ namespace quda
       xInvKD_sloppy = std::shared_ptr<GaugeField>(reinterpret_cast<GaugeField *>(new GaugeField(xinv_param)));
       xInvKD_sloppy->copy(*xInvKD);

-      ColorSpinorParam sloppy_tmp_param(tmp_coarse);
-      sloppy_tmp_param.setPrecision(param.mg_global.invert_param->cuda_prec_precondition);
-
-      tmp_coarse_sloppy = ColorSpinorField(sloppy_tmp_param);
-
     } else {
       // We can just alias fields
       xInvKD_sloppy = xInvKD;
@@ -777,15 +760,8 @@ namespace quda
   {
     pushLevel(param.level);

-    // temporary fields used for verification
-    ColorSpinorParam csParam(r);
-    csParam.create = QUDA_NULL_FIELD_CREATE;
-    ColorSpinorField tmp1(csParam);
-    ColorSpinorField tmp2(csParam);
-
-    QudaPrecision prec = (param.mg_global.precision_null[param.level] < csParam.Precision()) ?
-      param.mg_global.precision_null[param.level] :
-      csParam.Precision();
+    QudaPrecision prec = (param.mg_global.precision_null[param.level] < r.Precision()) ?
+      param.mg_global.precision_null[param.level] : r.Precision();

     // may want to revisit this---these were relaxed for cases where ghost_precision < precision
     // these were set while hacking in tests of quarter precision ghosts
@@ -798,28 +774,44 @@ namespace quda
     default: tol = 1e-8;
     }

+    // temporary fields used for verification
+    std::vector<ColorSpinorField> fine_tmp(param.Nvec);
+    ColorSpinorParam fine_param(r);
+    fine_param.create = QUDA_NULL_FIELD_CREATE;
+    for (auto &f : fine_tmp) f = ColorSpinorField(fine_param);
+
+    std::vector<ColorSpinorField> coarse_tmp(param.Nvec);
+    ColorSpinorParam coarse_param(r_coarse);
+    coarse_param.create = QUDA_NULL_FIELD_CREATE;
+    for (auto &c : coarse_tmp) c = ColorSpinorField(coarse_param);
+
+    auto &tmp1 = fine_tmp[0];
+    auto &tmp2 = fine_tmp[1];
+    auto &tmp_coarse = coarse_tmp[0];
+
     // No need to check (projector) v_k for staggered case
     if (param.transfer_type == QUDA_TRANSFER_AGGREGATE) {
       logQuda(QUDA_SUMMARIZE, "Checking 0 = (1 - P P^\\dagger) v_k for %d vectors\n", param.Nvec);

-      for (int i = 0; i < param.Nvec; i++) {
-        // as well as copying to the correct location this also changes basis if necessary
-        tmp1 = param.B[i];
+      // change fine_tmp to match B basis to allow comparison
+      for (auto &f : fine_tmp) f.GammaBasis(param.B[0].GammaBasis());
+      transfer->R(coarse_tmp, param.B);
+      transfer->P(fine_tmp, coarse_tmp);

-        transfer->R(r_coarse, tmp1);
-        transfer->P(tmp2, r_coarse);
-        auto max_deviation = blas::max_deviation(tmp2, tmp1);
-        auto l2_deviation = sqrt(xmyNorm(tmp1, tmp2) / norm2(tmp1));
+      for (auto i = 0; i < param.Nvec; i++) {
+        auto max_deviation = blas::max_deviation(fine_tmp[i], param.B[i]);
+        auto l2_deviation = sqrt(xmyNorm(param.B[i], fine_tmp[i]) / norm2(param.B[i]));

-        logQuda(
-          QUDA_VERBOSE, "Vector %d: L2 norms v_k = %e P^\\dagger v_k = %e (1 - P P^\\dagger) v_k = %e; Deviations: L2 relative = %e, max = %e\n",
-          i, norm2(tmp1), norm2(r_coarse), norm2(tmp2), l2_deviation, max_deviation[0]);
+        logQuda(QUDA_VERBOSE,
+                "Vector %d: L2 norms v_k = %e P^\\dagger v_k = %e (1 - P P^\\dagger) v_k = %e; Deviations: L2 relative = %e, max = %e\n",
+                i, norm2(param.B[i]), norm2(coarse_tmp[i]), norm2(fine_tmp[i]), l2_deviation, max_deviation[0]);

         if (check_deviation(l2_deviation, tol))
           errorQuda("k=%d orthonormality failed: L2 relative deviation %e > %e", i, l2_deviation, tol);
         if (check_deviation(max_deviation[0], tol))
           errorQuda("k=%d orthonormality failed: max deviation %e > %e", i, max_deviation[0], tol);
       }
+      for (auto &f : fine_tmp) f.GammaBasis(r.GammaBasis()); // restore basis

       // the oblique check
       if (param.mg_global.run_oblique_proj_check) {
@@ -840,8 +832,8 @@ namespace quda
         logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e\n", i, norm2(param.B[i]), norm2(tmp1));
         logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(param.B[i])));
       }
-      sprintf(prefix, "MG level %d (%s): ", param.level + 1,
-              param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
+
+      sprintf(prefix, "MG level %d (%s): ", param.level + 1, param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
       setOutputPrefix(prefix);
     }
   }
@@ -862,13 +854,6 @@ namespace quda
     }
 #endif

-    // We need to create temporary coarse vectors here for various verifications
-    // Otherwise these get created in the coarse `reset()` routine later
-
-    if (tmp_coarse.empty())
-      tmp_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(),
-                                            param.mg_global.location[param.level + 1]);
-
     // create coarse residual vector if not already created in verify()
     if (r_coarse.empty())
       r_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(),
                                           param.mg_global.location[param.level + 1]);
@@ -1185,8 +1170,12 @@ namespace quda

     diracSmoother->prepare(in, out, x, b, outer_solution_type);

-    // b_tilde holds either a copy of preconditioned source or a pointer to original source
-    if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) b_tilde = *in;
+    ColorSpinorField b_tilde;
+    // if we're using preconditioning then allocate storage for the preconditioned source vector
+    if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) {
+      b_tilde = getFieldTmp(r.Even());
+      b_tilde = *in; // b_tilde holds a copy of the preconditioned source
+    }

     if (presmoother) (*presmoother)(*out, *in); else zero(*out);

From 3fd08a1902fa8d24a95bbe8c833e7f77fc18af9d Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 13:27:43 -0800
Subject: [PATCH 15/21] Move fix to avoid copies to ColorSpinorField::copy, instead of being in blas::copy: this removes a bunch of unnecessary copies

---
 include/blas_quda.h        | 12 +-----------
 lib/color_spinor_field.cpp |  9 ++++++++-
 lib/transfer.cpp           |  2 +-
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/include/blas_quda.h b/include/blas_quda.h
index 07b09f1209..c2090f24a2 100644
--- a/include/blas_quda.h
+++ b/include/blas_quda.h
@@ -28,17 +28,7 @@ namespace quda {
       for (auto i = 0u; i < x.size(); i++) x[i].zero();
     }

-    inline void copy(ColorSpinorField &dst, const ColorSpinorField &src)
-    {
-      if (dst.data() == src.data()) {
-        // check the fields are equivalent else error
-        if (ColorSpinorField::are_compatible(dst, src))
-          return;
-        else
-          errorQuda("Aliasing pointers with incompatible fields");
-      }
-      dst.copy(src);
-    }
+    inline void copy(ColorSpinorField &dst, const ColorSpinorField &src) { dst.copy(src); }

     /** @brief Apply
       the operation y = a * x

diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 9c6c5d0272..f49e76d9cb 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -401,6 +401,12 @@ namespace quda

   void ColorSpinorField::copy(const ColorSpinorField &src)
   {
+    // if these are the same field then return
+    if (data() == src.data()) {
+      if (are_compatible(*this, src)) return;
+      else errorQuda("Aliasing pointers with incompatible fields");
+    }
+
     test_compatible_weak(*this, src);

     if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
@@ -672,7 +678,7 @@ namespace quda

   bool ColorSpinorField::are_compatible(const ColorSpinorField &a, const ColorSpinorField &b)
   {
-    return (a.Precision() == b.Precision() && a.FieldOrder() == b.FieldOrder() && are_compatible_weak(a, b));
+    return (a.Precision() == b.Precision() && a.FieldOrder() == b.FieldOrder() && a.GammaBasis() == b.GammaBasis() && are_compatible_weak(a, b));
   }

   void ColorSpinorField::test_compatible_weak(const ColorSpinorField &a, const ColorSpinorField &b)
@@ -691,6 +697,7 @@ namespace quda
     test_compatible_weak(a, b);
     if (a.Precision() != b.Precision()) errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision());
     if (a.FieldOrder() != b.FieldOrder()) errorQuda("orders do not match: %d %d", a.FieldOrder(), b.FieldOrder());
+    if (a.GammaBasis() != b.GammaBasis()) errorQuda("basis does not match: %d %d", a.GammaBasis(), b.GammaBasis());
   }

   const ColorSpinorField &ColorSpinorField::Even() const

diff --git a/lib/transfer.cpp b/lib/transfer.cpp
index fc48695190..3e0735b86c 100644
--- a/lib/transfer.cpp
+++ b/lib/transfer.cpp
@@ -397,7 +397,7 @@ namespace quda {

     }

-    for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically) FIXME - maybe not?
+    for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically)

     if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET)
       errorQuda("Cannot prolongate to a full field since only have single parity null-space components");

From ade7923684748a454e2d22c0fe22ab0720855d04 Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 13:47:42 -0800
Subject: [PATCH 16/21] Fix restrictor compilation with clang

---
 include/kernels/restrictor.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh
index aee1601915..7ca4985b0f 100644
--- a/include/kernels/restrictor.cuh
+++ b/include/kernels/restrictor.cuh
@@ -105,7 +105,7 @@ namespace quda {
         int i = coarse_color_block + coarse_color_local;

         ColorSpinor in;
-        arg.in[src_idx].load(in.data, spinor_parity, x_cb);
+        arg.in[src_idx].template load(in.data, spinor_parity, x_cb);

         if constexpr (Arg::fineSpin == 4 && Arg::from_non_rel) {
           in.toRel();

From e982208eafeb681deecb7cc37e4c8a6e70296a5d Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 14:30:48 -0800
Subject: [PATCH 17/21] Increase m-rhs limit to 96 for prolongator and restrictor

---
 include/kernels/prolongator.cuh | 2 +-
 include/kernels/restrictor.cuh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/kernels/prolongator.cuh b/include/kernels/prolongator.cuh
index 515c409e2c..f9175e3901 100644
--- a/include/kernels/prolongator.cuh
+++ b/include/kernels/prolongator.cuh
@@ -29,7 +29,7 @@ namespace quda {
     using C = FieldOrderCB(coarseSpin), Float, Float, true>;
     using V = FieldOrderCB(fineSpin), vFloat, vFloat>;

-    static constexpr unsigned int max_n_src = 64;
+    static constexpr unsigned int max_n_src = 96;
     const int_fastdiv n_src;
     F out[max_n_src];
     C in[max_n_src];

diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh
index 7ca4985b0f..4b30874d35 100644
--- a/include/kernels/restrictor.cuh
+++ b/include/kernels/restrictor.cuh
@@ -35,7 +35,7 @@ namespace quda {
     using C = FieldOrderCB(coarseSpin), Float, Float, true>;
     using V = FieldOrderCB(fineSpin), vFloat>;

-    static constexpr unsigned int max_n_src = 64;
+    static constexpr unsigned int max_n_src = 96;
     const int_fastdiv n_src;

     C out[max_n_src];

From 984146ed401aebe7297cf8e739fbff5f82a64b07 Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 17:13:49 -0800
Subject: [PATCH 18/21] Fix spinor_dilute warning that was missed before

---
 lib/spinor_dilute.in.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu
index eadd519c5b..edc28e6267 100644
--- a/lib/spinor_dilute.in.cu
+++ b/lib/spinor_dilute.in.cu
@@ -91,7 +91,8 @@ namespace quda
         SpinorDilute(src, v, type, local_block);
       } else {
         errorQuda(
-          "nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)");
+          "nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)",
+          src.Ncolor());
       }
     } else {
       if constexpr (sizeof...(N) > 0)

From e3cf139ad1646bdd58fcae9d69db444c706bd1af Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 17:16:07 -0800
Subject: [PATCH 19/21] Add batching support to prolongator, restrictor and dslash coarse.
 Max single kernel size is now set by a CMake parameter QUDA_MAX_MULTI_RHS, which defaults to 32

---
 CMakeLists.txt                    |  4 ++++
 include/kernels/dslash_coarse.cuh |  2 +-
 include/kernels/prolongator.cuh   |  2 +-
 include/kernels/restrictor.cuh    |  2 +-
 include/quda_define.h.in          |  7 +++++++
 lib/dslash_coarse.in.cpp          | 15 +++++++++++++--
 lib/prolongator.in.cpp            |  9 ++++++++-
 lib/restrictor.in.cpp             |  9 ++++++++-
 8 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e22f66d9cb..03ea9f4fba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -217,6 +217,9 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32)
   message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.")
 endif()

+# Set the maximum multi-RHS per kernel
+set(QUDA_MAX_MULTI_RHS "32" CACHE STRING "maximum number of simultaneous RHS in a kernel")
+
 set(QUDA_PRECISION "14" CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)")
@@ -275,6 +278,7 @@ mark_as_advanced(QUDA_FAST_COMPILE_DSLASH)
 mark_as_advanced(QUDA_ALTERNATIVE_I_TO_F)
 mark_as_advanced(QUDA_MAX_MULTI_BLAS_N)
+mark_as_advanced(QUDA_MAX_MULTI_RHS)
 mark_as_advanced(QUDA_PRECISION)
 mark_as_advanced(QUDA_RECONSTRUCT)
 mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE)

diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh
index 744a74f874..531ad18ebe 100644
--- a/include/kernels/dslash_coarse.cuh
+++ b/include/kernels/dslash_coarse.cuh
@@ -52,7 +52,7 @@ namespace quda {
     using F = typename colorspinor::FieldOrderCB;
     using GY = typename gauge::FieldOrder;

-    static constexpr unsigned int max_n_src = 64;
+    static constexpr unsigned int max_n_src = MAX_MULTI_RHS;
     const int_fastdiv n_src;
     F out[max_n_src];
     F inA[max_n_src];

diff --git a/include/kernels/prolongator.cuh b/include/kernels/prolongator.cuh
index f9175e3901..046037e24b 100644
--- a/include/kernels/prolongator.cuh
+++ b/include/kernels/prolongator.cuh
@@ -29,7 +29,7 @@ namespace quda {
     using C = FieldOrderCB(coarseSpin), Float, Float, true>;
     using V = FieldOrderCB(fineSpin), vFloat, vFloat>;

-    static constexpr unsigned int max_n_src = 96;
+    static constexpr unsigned int max_n_src = MAX_MULTI_RHS;
     const int_fastdiv n_src;
     F out[max_n_src];
     C in[max_n_src];

diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh
index 4b30874d35..e8c3662c7f 100644
--- a/include/kernels/restrictor.cuh
+++ b/include/kernels/restrictor.cuh
@@ -35,7 +35,7 @@ namespace quda {
     using C = FieldOrderCB(coarseSpin), Float, Float, true>;
     using V = FieldOrderCB(fineSpin), vFloat>;

-    static constexpr unsigned int max_n_src = 96;
+    static constexpr unsigned int max_n_src = MAX_MULTI_RHS;
     const int_fastdiv n_src;

     C out[max_n_src];

diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index 850c3990df..445d3cd544 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -18,6 +18,13 @@
  */
 #define MAX_MULTI_BLAS_N @QUDA_MAX_MULTI_BLAS_N@

+/**
+ * @def MAX_MULTI_RHS
+ * @brief This macro sets the limit on the number of simultaneous
+ * right-hand sides per kernel in the multi-RHS kernels
+ */
+#define MAX_MULTI_RHS @QUDA_MAX_MULTI_RHS@
+
 #cmakedefine QUDA_HETEROGENEOUS_ATOMIC
 #ifdef QUDA_HETEROGENEOUS_ATOMIC
 /**

diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp
index d2a51102e8..c90b119b42 100644
--- a/lib/dslash_coarse.in.cpp
+++ b/lib/dslash_coarse.in.cpp
@@ -103,8 +103,19 @@ namespace quda {

     if constexpr (is_enabled_multigrid()) {
       if (!DiracCoarse::apply_mma(out, use_mma) || checkLocation(Y, X) == QUDA_CPU_FIELD_LOCATION) {
-        ApplyCoarse(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, commDim, halo_precision,
-                    IntList<@QUDA_MULTIGRID_NVEC_LIST@>());
+
+        for (auto i = 0u; i < inA.size(); i += MAX_MULTI_RHS) { // batching if needed
+          auto inA_begin = inA.begin() + i;
+          auto inA_end = std::min(inA.begin() + (i + MAX_MULTI_RHS), inA.end());
+          auto inB_begin = inB.begin() + i;
+          auto inB_end = std::min(inB.begin() + (i + MAX_MULTI_RHS), inB.end());
+          auto out_begin = out.begin() + i;
+          auto out_end = std::min(out.begin() + (i + MAX_MULTI_RHS), out.end());
+
+          ApplyCoarse({out_begin, out_end}, {inA_begin, inA_end}, {inB_begin, inB_end},
+                      Y, X, kappa, parity, dslash, clover, dagger, commDim, halo_precision,
+                      IntList<@QUDA_MULTIGRID_NVEC_LIST@>());
+        }
       } else {
         constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
         ColorSpinorField v_inA = create_color_spinor_copy(inA, csOrder);

diff --git a/lib/prolongator.in.cpp b/lib/prolongator.in.cpp
index 286c18ba11..277d5c69f6 100644
--- a/lib/prolongator.in.cpp
+++ b/lib/prolongator.in.cpp
@@ -54,7 +54,14 @@ namespace quda
       // clang-format off
       IntList<@QUDA_MULTIGRID_NC_NVEC_LIST@> fineColors;
       // clang-format on
-      Prolongate(out, in, v, fine_to_coarse, spin_map, parity, fineColors);
+
+      for (auto i = 0u; i < in.size(); i += MAX_MULTI_RHS) { // batching if needed
+        auto in_begin = in.begin() + i;
+        auto in_end = std::min(in.begin() + (i + MAX_MULTI_RHS), in.end());
+        auto out_begin = out.begin() + i;
+        auto out_end = std::min(out.begin() + (i + MAX_MULTI_RHS), out.end());
+        Prolongate({out_begin, out_end}, {in_begin, in_end}, v, fine_to_coarse, spin_map, parity, fineColors);
+      }
     } else {
       errorQuda("Multigrid has not been built");
     }

diff --git a/lib/restrictor.in.cpp b/lib/restrictor.in.cpp
index ca40557846..3f12e4e380 100644
--- a/lib/restrictor.in.cpp
+++ b/lib/restrictor.in.cpp
@@ -54,7 +54,14 @@ namespace quda
      // clang-format off
      IntList<@QUDA_MULTIGRID_NC_NVEC_LIST@> fineColors;
      // clang-format on
-     Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity, fineColors);
+
+     for (auto i = 0u; i < in.size(); i += MAX_MULTI_RHS) { // batching if needed
+       auto in_begin = in.begin() + i;
+       auto in_end = std::min(in.begin() + (i + MAX_MULTI_RHS), in.end());
+       auto out_begin = out.begin() + i;
+       auto out_end = std::min(out.begin() + (i + MAX_MULTI_RHS), out.end());
+       Restrict({out_begin, out_end}, {in_begin, in_end}, v, fine_to_coarse, coarse_to_fine, spin_map, parity, fineColors);
+     }
    } else {
      errorQuda("Multigrid has not been built");
    }

From 5e2385007a1d867e6f5423386ae0c79e158767ea Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 17:16:45 -0800
Subject: [PATCH 20/21] Disable swizzle with restrictor when doing multi-RHS, since it doesn't do anything in this case except make the tuning longer

---
 lib/restrictor.in.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu
index 184cfca15f..595694873d 100644
--- a/lib/restrictor.in.cu
+++ b/lib/restrictor.in.cu
@@ -72,7 +72,7 @@ namespace quda {

     bool advanceAux(TuneParam &param) const
     {
-      if (Arg::swizzle) {
+      if (Arg::swizzle && in.size() == 1) {
         if (param.aux.x < 2 * (int)device::processor_count()) {
           param.aux.x++;
           return true;

From a6aafaff3fd7ed7d4694b78afd1e857fc0dee2f1 Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 17:23:30 -0800
Subject: [PATCH 21/21] On second thoughts, keep the swizzle tuning for small RHS count

---
 lib/restrictor.in.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu
index 595694873d..c25d8a5701 100644
--- a/lib/restrictor.in.cu
+++ b/lib/restrictor.in.cu
@@ -72,7 +72,7 @@ namespace quda {

     bool advanceAux(TuneParam &param) const
     {
-      if (Arg::swizzle && in.size() == 1) {
+      if (Arg::swizzle && in.size() < 8) {
         if (param.aux.x < 2 * (int)device::processor_count()) {
           param.aux.x++;
           return true;
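
Note: patches 19-21 bound each kernel launch by MAX_MULTI_RHS and walk the sources in
fixed-size batches, so an arbitrarily large right-hand-side set never overflows the
kernel-argument arrays. The standalone C++ sketch below reduces that idiom to its
essentials; it is illustrative only, with apply_kernel standing in for the real
ApplyCoarse/Prolongate/Restrict calls and a local constant mimicking the CMake-derived
MAX_MULTI_RHS macro.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

constexpr std::size_t MAX_MULTI_RHS = 32; // stands in for the CMake-derived macro

// stands in for one ApplyCoarse / Prolongate / Restrict launch on a batch
void apply_kernel(const std::vector<int> &batch)
{
  std::printf("kernel launch with %zu right-hand sides\n", batch.size());
}

int main()
{
  std::vector<int> rhs(70); // e.g. 70 sources -> batches of 32, 32 and 6

  for (std::size_t i = 0; i < rhs.size(); i += MAX_MULTI_RHS) {
    // clamp the batch end so the final batch may be smaller than the maximum
    std::size_t stop = std::min(rhs.size(), i + MAX_MULTI_RHS);
    std::vector<int> batch(rhs.begin() + i, rhs.begin() + stop);
    apply_kernel(batch); // each batch respects the kernel-arg array bound
  }
}

Because the final batch is simply smaller than the maximum, no padding or remainder-specific
kernel is needed; only the runtime n_src divisor inside the kernel argument changes per launch.
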