diff --git a/include/clover_field.h b/include/clover_field.h
index 380a399492..1bba0359a6 100644
--- a/include/clover_field.h
+++ b/include/clover_field.h
@@ -7,6 +7,19 @@
 
 namespace quda {
 
+  /**
+     @brief Helper function that returns whether clover fermions
+     were enabled at compile time (true iff GPU_CLOVER_DIRAC is defined).
+   */
+  constexpr bool is_enabled_clover()
+  {
+#ifdef GPU_CLOVER_DIRAC
+    return true;
+#else
+    return false;
+#endif
+  }
+
   namespace clover
   {
 
@@ -463,6 +476,29 @@ namespace quda {
   */
   void cloverInvert(CloverField &clover, bool computeTraceLog);
 
+  /**
+     @brief Driver for the clover force computation.  Eventually the
+     construction of the x and p fields will be delegated to this
+     function, but for now, we pre-compute these and pass them in.
+     @param mom[in,out] Momentum field to be updated
+     @param gaugeEx[in] Extended gauge field
+     @param gauge[in] Gauge field
+     @param clover[in] Clover field
+     @param x[in] Vector of quark solution fields
+     @param x0[in] Vector of auxiliary quark fields for determinant ratio
+     @param coeff[in] Vector of coefficients for the quark field outer
+     products
+     @param epsilon[in] Vector of scalar coefficient pairs (one per
+     parity) for the clover sigma outer product
+     @param sigma_coeff[in] Coefficient for the tr log clover force
+     @param detratio[in] Whether to compute determinant ratio
+     @param param Invert parameters; presumably selects the parity of the tr log clover force (TODO confirm)
+  */
+  void computeCloverForce(GaugeField &mom, const GaugeField &gaugeEx, const GaugeField &gauge,
+                          const CloverField &clover, cvector_ref<ColorSpinorField> &x, cvector_ref<ColorSpinorField> &x0,
+                          const std::vector<double> &coeff, const std::vector<array<double, 2>> &epsilon,
+                          double sigma_coeff, bool detratio, QudaInvertParam &param);
+
   /**
      @brief Compute the force contribution from the solver solution fields
 
@@ -480,9 +516,8 @@ namespace quda {
      @param p Intermediate vectors (both parities)
      @param coeff Multiplicative coefficient (e.g., dt * residue)
    */
-  void computeCloverForce(GaugeField& force, const GaugeField& U,
-			  std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &p,
-			  std::vector<double> &coeff);
+  void computeCloverForce(GaugeField &force, const GaugeField &U, cvector_ref<const ColorSpinorField> &x,
+                          cvector_ref<const ColorSpinorField> &p, const std::vector<double> &coeff);
   /**
      @brief Compute the outer product from the solver solution fields
      arising from the diagonal term of the fermion bilinear in
@@ -493,10 +528,8 @@ namespace quda {
      @param p[in] Intermediate vectors (both parities)
      @coeff coeff[in] Multiplicative coefficient (e.g., dt * residiue), one for each parity
   */
-  void computeCloverSigmaOprod(GaugeField& oprod,
-			       std::vector<ColorSpinorField*> &x,
-			       std::vector<ColorSpinorField*> &p,
-			       std::vector< std::vector<double> > &coeff);
+  void computeCloverSigmaOprod(GaugeField &oprod, cvector_ref<const ColorSpinorField> &x,
+                               cvector_ref<const ColorSpinorField> &p, const std::vector<array<double, 2>> &coeff);
   /**
      @brief Compute the matrix tensor field necessary for the force calculation from
      the clover trace action.  This computes a tensor field [mu,nu].
@@ -504,8 +537,9 @@ namespace quda {
      @param output The computed matrix field (tensor matrix field)
      @param clover The input clover field
      @param coeff  Scalar coefficient multiplying the result (e.g., stepsize)
+     @param parity The field parity we are working on
    */
-  void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff);
+  void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff, int parity);
 
   /**
      @brief Compute the derivative of the clover matrix in the direction
@@ -516,9 +550,8 @@ namespace quda {
      @param gauge The input gauge field
      @param oprod The input outer-product field (tensor matrix field)
      @param coeff Multiplicative coefficient (e.g., clover coefficient)
-     @param parity The field parity we are working on
    */
-  void cloverDerivative(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, QudaParity parity);
+  void cloverDerivative(GaugeField &force, const GaugeField &gauge, const GaugeField &oprod, double coeff);
 
   /**
     @brief This function is used for copying from a source clover field to a destination clover field
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 4801bdbf48..f6fc42d194 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -333,7 +333,7 @@ namespace quda
     size_t norm_offset = 0; /** offset to the norm (if applicable) */
 
     // multi-GPU parameters
-    array_2d<void *, 2, QUDA_MAX_DIM> ghost = {};          // pointers to the ghost regions - NULL by default
+    mutable array_2d<void *, 2, QUDA_MAX_DIM> ghost = {};  // pointers to the ghost regions - NULL by default
     mutable lat_dim_t ghostFace = {};                      // the size of each face
     mutable lat_dim_t ghostFaceCB = {};                    // the size of each checkboarded face
     mutable array<void *, 2 *QUDA_MAX_DIM> ghost_buf = {}; // wrapper that points to current ghost zone
@@ -510,7 +510,7 @@ namespace quda
        @param[in] nFace Depth of each halo
        @param[in] spin_project Whether the halos are spin projected (Wilson-type fermions only)
     */
-    void createComms(int nFace, bool spin_project = true);
+    void createComms(int nFace, bool spin_project = true) const;
 
     /**
        @brief Packs the ColorSpinorField's ghost zone
@@ -530,7 +530,7 @@ namespace quda
       */
     void packGhost(const int nFace, const QudaParity parity, const int dagger, const qudaStream_t &stream,
                    MemoryLocation location[2 * QUDA_MAX_DIM], MemoryLocation location_label, bool spin_project,
-                   double a = 0, double b = 0, double c = 0, int shmem = 0);
+                   double a = 0, double b = 0, double c = 0, int shmem = 0) const;
 
     /**
        Pack the field halos in preparation for halo exchange, e.g., for Dslash
@@ -550,7 +550,7 @@ namespace quda
     */
     void pack(int nFace, int parity, int dagger, const qudaStream_t &stream, MemoryLocation location[2 * QUDA_MAX_DIM],
               MemoryLocation location_label, bool spin_project = true, double a = 0, double b = 0, double c = 0,
-              int shmem = 0);
+              int shmem = 0) const;
 
     /**
       @brief Initiate the gpu to cpu send of the ghost zone (halo)
@@ -559,7 +559,7 @@ namespace quda
       @param dir The direction (QUDA_BACKWARDS or QUDA_FORWARDS)
       @param stream The array of streams to use
       */
-    void sendGhost(void *ghost_spinor, const int dim, const QudaDirection dir, const qudaStream_t &stream);
+    void sendGhost(void *ghost_spinor, const int dim, const QudaDirection dir, const qudaStream_t &stream) const;
 
     /**
       Initiate the cpu to gpu send of the ghost zone (halo)
@@ -568,7 +568,7 @@ namespace quda
       @param dir The direction (QUDA_BACKWARDS or QUDA_FORWARDS)
       @param stream The array of streams to use
       */
-    void unpackGhost(const void *ghost_spinor, const int dim, const QudaDirection dir, const qudaStream_t &stream);
+    void unpackGhost(const void *ghost_spinor, const int dim, const QudaDirection dir, const qudaStream_t &stream) const;
 
     /**
        @brief Copies the ghost to the host from the device, prior to
@@ -577,7 +577,7 @@ namespace quda
        the scatter-centric direction (0=backwards,1=forwards)
        @param[in] stream The stream in which to do the copy
      */
-    void gather(int dir, const qudaStream_t &stream);
+    void gather(int dir, const qudaStream_t &stream) const;
 
     /**
        @brief Initiate halo communication receive
@@ -585,7 +585,7 @@ namespace quda
        the scatter-centric direction (0=backwards,1=forwards)
        @param[in] gdr Whether we are using GDR on the receive side
     */
-    void recvStart(int dir, const qudaStream_t &stream, bool gdr = false);
+    void recvStart(int dir, const qudaStream_t &stream, bool gdr = false) const;
 
     /**
        @brief Initiate halo communication sending
@@ -596,7 +596,7 @@ namespace quda
        @param[in] gdr Whether we are using GDR on the send side
        @param[in] remote_write Whether we are writing direct to remote memory (or using copy engines)
     */
-    void sendStart(int d, const qudaStream_t &stream, bool gdr = false, bool remote_write = false);
+    void sendStart(int d, const qudaStream_t &stream, bool gdr = false, bool remote_write = false) const;
 
     /**
        @brief Initiate halo communication
@@ -606,7 +606,7 @@ namespace quda
        @param[in] gdr_send Whether we are using GDR on the send side
        @param[in] gdr_recv Whether we are using GDR on the receive side
     */
-    void commsStart(int d, const qudaStream_t &stream, bool gdr_send = false, bool gdr_recv = false);
+    void commsStart(int d, const qudaStream_t &stream, bool gdr_send = false, bool gdr_recv = false) const;
 
     /**
        @brief Non-blocking query if the halo communication has completed
@@ -616,7 +616,7 @@ namespace quda
        @param[in] gdr_send Whether we are using GDR on the send side
        @param[in] gdr_recv Whether we are using GDR on the receive side
     */
-    int commsQuery(int d, const qudaStream_t &stream, bool gdr_send = false, bool gdr_recv = false);
+    int commsQuery(int d, const qudaStream_t &stream, bool gdr_send = false, bool gdr_recv = false) const;
 
     /**
        @brief Wait on halo communication to complete
@@ -626,7 +626,7 @@ namespace quda
        @param[in] gdr_send Whether we are using GDR on the send side
        @param[in] gdr_recv Whether we are using GDR on the receive side
     */
-    void commsWait(int d, const qudaStream_t &stream, bool gdr_send = false, bool gdr_recv = false);
+    void commsWait(int d, const qudaStream_t &stream, bool gdr_send = false, bool gdr_recv = false) const;
 
     /**
        @brief Unpacks the ghost from host to device after
@@ -636,7 +636,7 @@ namespace quda
        @param[in] stream The stream in which to do the copy.  If
        -1 is passed then the copy will be issied to the d^th stream
      */
-    void scatter(int d, const qudaStream_t &stream);
+    void scatter(int d, const qudaStream_t &stream) const;
 
     /**
        Do the exchange between neighbouring nodes of the data in
@@ -725,6 +725,9 @@ namespace quda
     ColorSpinorField &Even();
     ColorSpinorField &Odd();
 
+    const ColorSpinorField &operator[](QudaParity parity) const { return parity == QUDA_EVEN_PARITY ? Even() : Odd(); }
+    ColorSpinorField &operator[](QudaParity parity) { return parity == QUDA_EVEN_PARITY ? Even() : Odd(); }
+
     CompositeColorSpinorField &Components() { return components; };
 
     /**
diff --git a/include/gauge_field.h b/include/gauge_field.h
index 948960b36f..ffea94c22b 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -711,7 +711,7 @@ namespace quda {
      @param recon The reconsturction type
      @return the pointer to the extended gauge field
   */
-  GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile,
+  GaugeField *createExtendedGauge(const GaugeField &in, const lat_dim_t &R, TimeProfile &profile = getProfile(),
                                   bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID);
 
   /**
diff --git a/include/kernels/clover_deriv.cuh b/include/kernels/clover_deriv.cuh
index 38ab548766..59213a7524 100644
--- a/include/kernels/clover_deriv.cuh
+++ b/include/kernels/clover_deriv.cuh
@@ -7,7 +7,8 @@
 namespace quda
 {
 
-  template <typename Float, QudaReconstructType recon> struct CloverDerivArg : kernel_param<> {
+  template <typename Float, int nColor_, QudaReconstructType recon> struct CloverDerivArg : kernel_param<> {
+    static constexpr int nColor = nColor_;
     using Force = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type;
     using Oprod = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type;
     using Gauge = typename gauge_mapper<Float, recon>::type;
@@ -16,19 +17,13 @@ namespace quda
     int E[4];
     int border[4];
     real coeff;
-    int parity;
 
     Force force;
     Gauge gauge;
     Oprod oprod;
 
-    CloverDerivArg(const GaugeField &force, const GaugeField &gauge, const GaugeField &oprod, double coeff, int parity) :
-      kernel_param(dim3(force.VolumeCB(), 2, 4)),
-      coeff(coeff),
-      parity(parity),
-      force(force),
-      gauge(gauge),
-      oprod(oprod)
+    CloverDerivArg(GaugeField &force, const GaugeField &gauge, const GaugeField &oprod, double coeff) :
+      kernel_param(dim3(force.VolumeCB(), 2, 4)), coeff(coeff), force(force), gauge(gauge), oprod(oprod)
     {
       for (int dir = 0; dir < 4; ++dir) {
         X[dir] = force.X()[dir];
@@ -39,168 +34,167 @@ namespace quda
   };
 
   using computeForceOps = SpecialOps<thread_array<int, 4>>;
-  template <typename Link, typename Ftor>
-  __device__ __host__ void computeForce(Link &force_total, const Ftor &ftor, int xIndex, int yIndex, int mu, int nu)
+  template <typename Link, typename Force, typename Ftor>
+  __device__ __host__ void computeForce(Force &force_total, const Ftor &ftor, int xIndex, int parity, int mu, int nu)
   {
     const auto &arg = ftor.arg;
-    const int otherparity = (1 - arg.parity);
+    const int otherparity = (1 - parity);
     const int tidx = mu > nu ? (mu - 1) * mu / 2 + nu : (nu - 1) * nu / 2 + mu;
 
-    if (yIndex == 0) { // do "this" force
+    int x[4];
+    getCoordsExtended(x, xIndex, arg.X, parity, arg.border);
 
-      int x[4];
-      getCoordsExtended(x, xIndex, arg.X, arg.parity, arg.border);
-
-      // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)
-      {
-        thread_array<int, 4> d{ftor};
-
-        // load U(x)_(+mu)
-        Link U1 = arg.gauge(mu, linkIndexShift(x, d, arg.E), arg.parity);
-
-        // load U(x+mu)_(+nu)
-        d[mu]++;
-        Link U2 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
-        d[mu]--;
-
-        // load U(x+nu)_(+mu)
-        d[nu]++;
-        Link U3 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
-        d[nu]--;
-
-        // load U(x)_(+nu)
-        Link U4 = arg.gauge(nu, linkIndexShift(x, d, arg.E), arg.parity);
-
-        // load Oprod
-        Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
-        Link force = U1 * U2 * conj(U3) * conj(U4) * Oprod1;
-
-        d[mu]++;
-        d[nu]++;
-        Link Oprod2 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
-        force += U1 * U2 * Oprod2 * conj(U3) * conj(U4);
-
-        if (nu < mu) force_total -= force;
-        else force_total += force;
-      }
-
-      {
-        thread_array<int, 4> d{ftor};
-
-        // load U(x-nu)(+nu)
-        d[nu]--;
-        Link U1 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
-        d[nu]++;
-
-        // load U(x-nu)(+mu)
-        d[nu]--;
-        Link U2 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
-        d[nu]++;
-
-        // load U(x+mu-nu)(nu)
-        d[mu]++;
-        d[nu]--;
-        Link U3 = arg.gauge(nu, linkIndexShift(x, d, arg.E), arg.parity);
-        d[mu]--;
-        d[nu]++;
-
-        // load U(x)_(+mu)
-        Link U4 = arg.gauge(mu, linkIndexShift(x, d, arg.E), arg.parity);
-
-        d[mu]++;
-        d[nu]--;
-        Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
-        Link force = conj(U1) * U2 * Oprod1 * U3 * conj(U4);
-
-        d[mu]--;
-        d[nu]++;
-        Link Oprod4 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
-        force += Oprod4 * conj(U1) * U2 * U3 * conj(U4);
-
-        if (nu < mu) force_total += force;
-        else force_total -= force;
-      }
-
-    } else { // else do other force
-
-      int y[4] = { };
-      getCoordsExtended(y, xIndex, arg.X, otherparity, arg.border);
+    // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)
+    {
+      thread_array<int, 4> d{ftor};
 
-      {
-        thread_array<int, 4> d{ftor};
+      // load U(x)_(+mu)
+      Link U1 = arg.gauge(mu, linkIndexShift(x, d, arg.E), parity);
 
-        // load U(x)_(+mu)
-        Link U1 = arg.gauge(mu, linkIndexShift(y, d, arg.E), otherparity);
+      // load U(x+mu)_(+nu)
+      d[mu]++;
+      Link U2 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
+      d[mu]--;
 
-        // load U(x+mu)_(+nu)
-        d[mu]++;
-        Link U2 = arg.gauge(nu, linkIndexShift(y, d, arg.E), arg.parity);
-        d[mu]--;
+      // load U(x+nu)_(+mu)
+      d[nu]++;
+      Link U3 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
+      d[nu]--;
 
-        // load U(x+nu)_(+mu)
-        d[nu]++;
-        Link U3 = arg.gauge(mu, linkIndexShift(y, d, arg.E), arg.parity);
-        d[nu]--;
+      // load U(x)_(+nu)
+      Link U4 = arg.gauge(nu, linkIndexShift(x, d, arg.E), parity);
 
-        // load U(x)_(+nu)
-        Link U4 = arg.gauge(nu, linkIndexShift(y, d, arg.E), otherparity);
+      // load Oprod
+      Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), parity);
+      Link force = U1 * U2 * conj(U3) * conj(U4) * Oprod1;
 
-        // load opposite parity Oprod
-        d[nu]++;
-        Link Oprod3 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
-        Link force = U1 * U2 * conj(U3) * Oprod3 * conj(U4);
+      d[mu]++;
+      d[nu]++;
+      Link Oprod2 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), parity);
+      force += U1 * U2 * Oprod2 * conj(U3) * conj(U4);
 
-        // load Oprod(x+mu)
-        d[nu]--;
-        d[mu]++;
-        Link Oprod4 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
-        force += U1 * Oprod4 * U2 * conj(U3) * conj(U4);
+      if (nu < mu)
+        force_total -= force;
+      else
+        force_total += force;
+    }
 
-        if (nu < mu) force_total -= force;
-        else force_total += force;
-      }
+    {
+      thread_array<int, 4> d{ftor};
+
+      // load U(x-nu)(+nu)
+      d[nu]--;
+      Link U1 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
+      d[nu]++;
+
+      // load U(x-nu)(+mu)
+      d[nu]--;
+      Link U2 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
+      d[nu]++;
+
+      // load U(x+mu-nu)(nu)
+      d[mu]++;
+      d[nu]--;
+      Link U3 = arg.gauge(nu, linkIndexShift(x, d, arg.E), parity);
+      d[mu]--;
+      d[nu]++;
+
+      // load U(x)_(+mu)
+      Link U4 = arg.gauge(mu, linkIndexShift(x, d, arg.E), parity);
+
+      d[mu]++;
+      d[nu]--;
+      Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), parity);
+      Link force = conj(U1) * U2 * Oprod1 * U3 * conj(U4);
+
+      d[mu]--;
+      d[nu]++;
+      Link Oprod4 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), parity);
+      force += Oprod4 * conj(U1) * U2 * U3 * conj(U4);
+
+      if (nu < mu)
+        force_total += force;
+      else
+        force_total -= force;
+    }
 
-      // Lower leaf
-      // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x)
-      {
-        thread_array<int, 4> d{ftor};
-
-        // load U(x-nu)(+nu)
-        d[nu]--;
-        Link U1 = arg.gauge(nu, linkIndexShift(y, d, arg.E), arg.parity);
-        d[nu]++;
-
-        // load U(x-nu)(+mu)
-        d[nu]--;
-        Link U2 = arg.gauge(mu, linkIndexShift(y, d, arg.E), arg.parity);
-        d[nu]++;
-
-        // load U(x+mu-nu)(nu)
-        d[mu]++;
-        d[nu]--;
-        Link U3 = arg.gauge(nu, linkIndexShift(y, d, arg.E), otherparity);
-        d[mu]--;
-        d[nu]++;
-
-        // load U(x)_(+mu)
-        Link U4 = arg.gauge(mu, linkIndexShift(y, d, arg.E), otherparity);
-
-        // load Oprod(x+mu)
-        d[mu]++;
-        Link Oprod1 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
-        Link force = conj(U1) * U2 * U3 * Oprod1 * conj(U4);
-
-        d[mu]--;
-        d[nu]--;
-        Link Oprod2 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
-        force += conj(U1) * Oprod2 * U2 * U3 * conj(U4);
-
-        if (nu < mu) force_total += force;
-        else force_total -= force;
-      }
+    {
+      thread_array<int, 4> d{ftor};
+
+      // load U(x)_(+mu)
+      Link U1 = arg.gauge(mu, linkIndexShift(x, d, arg.E), parity);
+
+      // load U(x+mu)_(+nu)
+      d[mu]++;
+      Link U2 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
+      d[mu]--;
+
+      // load U(x+nu)_(+mu)
+      d[nu]++;
+      Link U3 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
+      d[nu]--;
+
+      // load U(x)_(+nu)
+      Link U4 = arg.gauge(nu, linkIndexShift(x, d, arg.E), parity);
+
+      // load opposite parity Oprod
+      d[nu]++;
+      Link Oprod3 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), otherparity);
+      Link force = U1 * U2 * conj(U3) * Oprod3 * conj(U4);
+
+      // load Oprod(x+mu)
+      d[nu]--;
+      d[mu]++;
+      Link Oprod4 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), otherparity);
+      force += U1 * Oprod4 * U2 * conj(U3) * conj(U4);
+
+      if (nu < mu)
+        force_total -= force;
+      else
+        force_total += force;
     }
 
-  } // namespace quda
+    // Lower leaf
+    // U[nu*](x-nu) U[mu](x-nu) U[nu](x+mu-nu) Oprod(x+mu) U[*mu](x)
+    {
+      thread_array<int, 4> d{ftor};
+
+      // load U(x-nu)(+nu)
+      d[nu]--;
+      Link U1 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
+      d[nu]++;
+
+      // load U(x-nu)(+mu)
+      d[nu]--;
+      Link U2 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
+      d[nu]++;
+
+      // load U(x+mu-nu)(nu)
+      d[mu]++;
+      d[nu]--;
+      Link U3 = arg.gauge(nu, linkIndexShift(x, d, arg.E), parity);
+      d[mu]--;
+      d[nu]++;
+
+      // load U(x)_(+mu)
+      Link U4 = arg.gauge(mu, linkIndexShift(x, d, arg.E), parity);
+
+      // load Oprod(x+mu)
+      d[mu]++;
+      Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), otherparity);
+      Link force = conj(U1) * U2 * U3 * Oprod1 * conj(U4);
+
+      d[mu]--;
+      d[nu]--;
+      Link Oprod2 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), otherparity);
+      force += conj(U1) * Oprod2 * U2 * U3 * conj(U4);
+
+      if (nu < mu)
+        force_total += force;
+      else
+        force_total -= force;
+    }
+  }
 
   template <typename Arg> struct CloverDerivative : computeForceOps
   {
@@ -213,20 +207,19 @@ namespace quda
     {
       using real = typename Arg::real;
       using Complex = complex<real>;
-      using Link = Matrix<Complex, 3>;
+      using Link = Matrix<Complex, Arg::nColor>;
 
       Link force;
 
-#pragma unroll
       for (int nu = 0; nu < 4; nu++) {
         if (nu == mu) continue;
-        computeForce(force, *this, x_cb, parity, mu, nu);
+        computeForce<Link>(force, *this, x_cb, parity, mu, nu);
       }
 
       // Write to array
-      Link F = arg.force(mu, x_cb, parity == 0 ? arg.parity : 1 - arg.parity);
-      F += arg.coeff * force;
-      arg.force(mu, x_cb, parity == 0 ? arg.parity : 1 - arg.parity) = F;
+      Link F = arg.force(mu, x_cb, parity);
+      F += arg.coeff * static_cast<Link>(force);
+      arg.force(mu, x_cb, parity) = F;
     }
   };
 
diff --git a/include/kernels/clover_outer_product.cuh b/include/kernels/clover_outer_product.cuh
index a915521e54..70e505b149 100644
--- a/include/kernels/clover_outer_product.cuh
+++ b/include/kernels/clover_outer_product.cuh
@@ -32,8 +32,9 @@ namespace quda {
     real coeff;
 
     CloverForceArg(GaugeField &force, const GaugeField &U, const ColorSpinorField &inA, const ColorSpinorField &inB,
-                   const ColorSpinorField &inC, const ColorSpinorField &inD, const unsigned int parity, const double coeff) :
-      kernel_param(dim3(dim == -1 ? inA.VolumeCB() : inA.GhostFaceCB()[dim])),
+                   const ColorSpinorField &inC, const ColorSpinorField &inD, const unsigned int parity,
+                   const double coeff) :
+      kernel_param(dim3(dim == -1 ? inA.VolumeCB() : inB.GhostFaceCB()[dim])), // inB since it has a ghost allocated
       force(force),
       inA(inA),
       inB(inB),
@@ -46,6 +47,21 @@ namespace quda {
     {
       for (int i=0; i<4; ++i) this->X[i] = U.X()[i];
       for (int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
+
+      // need to reset the ghost pointers since the default ghost_offset
+      // (Ghost() method) is not set (this is a temporary workaround)
+      void *ghost[8] = {};
+      for (int dim = 0; dim < 4; dim++) {
+        for (int dir = 0; dir < 2; dir++) { ghost[2 * dim + dir] = (char *)inB.Ghost2() + inB.GhostOffset(dim, dir); }
+      }
+      this->inB.resetGhost(ghost);
+      inD.bufferIndex = (1 - inD.bufferIndex);
+
+      for (int dim = 0; dim < 4; dim++) {
+        for (int dir = 0; dir < 2; dir++) { ghost[2 * dim + dir] = (char *)inD.Ghost2() + inD.GhostOffset(dim, dir); }
+      }
+      this->inD.resetGhost(ghost);
+      inD.bufferIndex = (1 - inD.bufferIndex);
     }
   };
 
diff --git a/include/kernels/clover_sigma_outer_product.cuh b/include/kernels/clover_sigma_outer_product.cuh
index 91d2de8ef5..e4b901cdc1 100644
--- a/include/kernels/clover_sigma_outer_product.cuh
+++ b/include/kernels/clover_sigma_outer_product.cuh
@@ -21,21 +21,18 @@ namespace quda
     using F = typename colorspinor_mapper<Float, nSpin, nColor>::type;
 
     Oprod oprod;
-    const F inA[nvector];
-    const F inB[nvector];
-    real coeff[nvector][2];
+    F inA[nvector];
+    F inB[nvector];
+    array_2d<real, nvector, 2> coeff;
 
-    CloverSigmaOprodArg(GaugeField &oprod, const std::vector<ColorSpinorField*> &inA,
-                        const std::vector<ColorSpinorField*> &inB,
-                        const std::vector<std::vector<double>> &coeff_) :
-      kernel_param(dim3(oprod.VolumeCB(), 2, 6)),
-      oprod(oprod),
-      inA{*inA[0]},
-      inB{*inB[0]}
+    CloverSigmaOprodArg(GaugeField &oprod, cvector_ref<const ColorSpinorField> &inA,
+                        cvector_ref<const ColorSpinorField> &inB, const std::vector<array<double, 2>> &coeff_) :
+      kernel_param(dim3(oprod.VolumeCB(), 2, 6)), oprod(oprod)
     {
       for (int i = 0; i < nvector; i++) {
-        coeff[i][0] = coeff_[i][0];
-        coeff[i][1] = coeff_[i][1];
+        this->inA[i] = inA[i];
+        this->inB[i] = inB[i];
+        coeff[i] = {static_cast<real>(coeff_[i][0]), static_cast<real>(coeff_[i][1])};
       }
     }
   };
diff --git a/include/kernels/clover_trace.cuh b/include/kernels/clover_trace.cuh
index d7d7370fe3..d5f7514489 100644
--- a/include/kernels/clover_trace.cuh
+++ b/include/kernels/clover_trace.cuh
@@ -4,148 +4,157 @@
 #include <gauge_field_order.h>
 #include <clover_field_order.h>
 #include <kernel.h>
+#include <linalg.cuh>
 
 namespace quda {
 
-  template <typename Float, int nColor_>
-  struct CloverTraceArg : kernel_param<> {
+  template <typename Float, int nColor_, bool twist_> struct CloverTraceArg : kernel_param<> {
     using real = typename mapper<Float>::type;
+    static constexpr bool twist = twist_;
     static constexpr int nColor = nColor_;
+    static constexpr int nSpin = 4;
+    static constexpr bool dynamic_clover = clover::dynamic_inverse();
     using C = typename clover_mapper<Float>::type;
     using G = typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type;
     G output;
-    const C clover1;
-    const C clover2;
+    const C clover;
+    const C clover_inv;
     real coeff;
+    real mu2_minus_epsilon2;
+    const int parity;
 
-    CloverTraceArg(GaugeField& output, const CloverField& clover, double coeff) :
-      kernel_param(dim3(clover.VolumeCB(), 1, 1)),
+    CloverTraceArg(GaugeField &output, const CloverField &clover, double coeff, int parity) :
+      kernel_param(dim3(output.VolumeCB(), 1, 1)),
       output(output),
-      clover1(clover, 0),
-      clover2(clover, 1),
-      coeff(coeff) {}
+      clover(clover, false),
+      clover_inv(clover, dynamic_clover ? false : true),
+      coeff(coeff),
+      mu2_minus_epsilon2(clover.Mu2() - clover.Epsilon2()),
+      parity(parity)
+    {
+    }
   };
 
-  template <typename Arg>
-  __device__ __host__ void cloverSigmaTraceCompute(const Arg &arg, const int x, int parity)
+  template <typename Arg> __device__ __host__ inline void cloverSigmaTraceCompute(const Arg &arg, const int x)
   {
+    using namespace linalg; // for Cholesky
     using real = typename Arg::real;
-    real A[72];
-    if (parity==0) arg.clover1.load(A,x,parity);
-    else arg.clover2.load(A,x,parity);
+    constexpr int N = Arg::nColor;
+    using Mat = HMatrix<real, N * Arg::nSpin / 2>;
+    Mat A[2];
 
     // load the clover term into memory
-    for (int mu=0; mu<4; mu++) {
-      for (int nu=0; nu<mu; nu++) {
+#pragma unroll
+    for (int ch = 0; ch < 2; ch++) {
+      A[ch] = arg.clover_inv(x, arg.parity, ch);
+      A[ch] *= static_cast<real>(2.0); // factor of two is inherent to QUDA clover storage
+
+      if constexpr (Arg::dynamic_clover) {
+        if constexpr (Arg::twist) { // Compute (T^2 + mu2 - epsilon2) first, then invert
+          A[ch] = A[ch].square();
+          A[ch] += arg.mu2_minus_epsilon2;
+        }
 
-        Matrix<complex<real>, Arg::nColor> mat;
-        setZero(&mat);
+        // compute the Cholesky decomposition
+        Cholesky<HMatrix, clover::cholesky_t<real>, N * Arg::nSpin / 2> cholesky(A[ch]);
+        A[ch] = cholesky.template invert<Mat>(); // return full inverse
+      }
 
-        real diag[2][6];
-        complex<real> tri[2][15];
-        const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
-        complex<real> ctmp;
+      if constexpr (Arg::twist) {
+        Mat A0 = arg.clover(x, arg.parity, ch);
+        A[ch] = static_cast<real>(0.5) * (A0 * A[ch]); // (1 + T + imu g_5)^{-1} = (1 + T - imu g_5)/((1 + T)^2 + mu^2)
+      }
+    }
 
-        for (int ch=0; ch<2; ++ch) {
-          // factor of two is inherent to QUDA clover storage
-          for (int i=0; i<6; i++) diag[ch][i] = 2.0*A[ch*36+i];
-          for (int i=0; i<15; i++) tri[ch][idtab[i]] = complex<real>(2.0*A[ch*36+6+2*i], 2.0*A[ch*36+6+2*i+1]);
-        }
+    const Mat &A0 = A[0];
+    const Mat &A1 = A[1];
+
+#pragma unroll
+    for (int mu = 0; mu < 4; mu++) {
+#pragma unroll
+      for (int nu = 0; nu < 4; nu++) {
+        if (nu >= mu) continue;
+        Matrix<complex<real>, Arg::nColor> mat = {};
 
         // X, Y
         if (nu == 0) {
           if (mu == 1) {
-            for (int j=0; j<3; ++j) {
-              mat(j,j).y = diag[0][j+3] + diag[1][j+3] - diag[0][j] - diag[1][j];
+#pragma unroll
+            for (int j = 0; j < N; ++j) {
+              mat(j, j).imag(A0(j + N, j + N).real() + A1(j + N, j + N).real() - A0(j, j).real() - A1(j, j).real());
             }
 
             // triangular part
-            int jk=0;
-            for (int j=1; j<3; ++j) {
-              int jk2 = (j+3)*(j+2)/2 + 3;
+#pragma unroll
+            for (int j = 1; j < N; ++j) {
+#pragma unroll
               for (int k=0; k<j; ++k) {
-                ctmp = tri[0][jk2] + tri[1][jk2] - tri[0][jk] - tri[1][jk];
-
-                mat(j,k).x = -ctmp.imag();
-                mat(j,k).y =  ctmp.real();
-                mat(k,j).x =  ctmp.imag();
-                mat(k,j).y =  ctmp.real();
-
-                jk++; jk2++;
+                auto ctmp = A0(j + N, k + N) + A1(j + N, k + N) - A0(j, k) - A1(j, k);
+                mat(j, k) = i_(ctmp);
+                mat(k, j) = i_(conj(ctmp));
               }
             } // X Y
 
           } else if (mu == 2) {
 
-            for (int j=0; j<3; ++j) {
-              int jk = (j+3)*(j+2)/2;
-              for (int k=0; k<3; ++k) {
-                int kj = (k+3)*(k+2)/2 + j;
-                mat(j,k) = conj(tri[0][kj]) - tri[0][jk] + conj(tri[1][kj]) - tri[1][jk];
-                jk++;
+#pragma unroll
+            for (int j = 0; j < N; ++j) {
+#pragma unroll
+              for (int k = 0; k < N; ++k) {
+                mat(j, k) = conj(A0(k + N, j)) - A0(j + N, k) + conj(A1(k + N, j)) - A1(j + N, k);
               }
             } // X Z
 
           } else if (mu == 3) {
-            for (int j=0; j<3; ++j) {
-              int jk = (j+3)*(j+2)/2;
-              for (int k=0; k<3; ++k) {
-                int kj = (k+3)*(k+2)/2 + j;
-                ctmp = conj(tri[0][kj]) + tri[0][jk] - conj(tri[1][kj]) - tri[1][jk];
-                mat(j,k).x = -ctmp.imag();
-                mat(j,k).y =  ctmp.real();
-                jk++;
+#pragma unroll
+            for (int j = 0; j < N; ++j) {
+#pragma unroll
+              for (int k = 0; k < N; ++k) {
+                mat(j, k) = i_(conj(A0(k + N, j)) + A0(j + N, k) - conj(A1(k + N, j)) - A1(j + N, k));
               }
             }
           } // mu == 3 // X T
+
         } else if (nu == 1) {
           if (mu == 2) { // Y Z
-            for (int j=0; j<3; ++j) {
-              int jk = (j+3)*(j+2)/2;
-              for (int k=0; k<3; ++k) {
-                int kj = (k+3)*(k+2)/2 + j;
-                ctmp = conj(tri[0][kj]) + tri[0][jk] + conj(tri[1][kj]) + tri[1][jk];
-                mat(j,k).x =  ctmp.imag();
-                mat(j,k).y = -ctmp.real();
-                jk++;
+#pragma unroll
+            for (int j = 0; j < N; ++j) {
+#pragma unroll
+              for (int k = 0; k < N; ++k) {
+                mat(j, k) = -i_(conj(A0(k + N, j)) + A0(j + N, k) + conj(A1(k + N, j)) + A1(j + N, k));
               }
             }
           } else if (mu == 3){ // Y T
-            for (int j=0; j<3; ++j) {
-              int jk = (j+3)*(j+2)/2;
-              for (int k=0; k<3; ++k) {
-                int kj = (k+3)*(k+2)/2 + j;
-                mat(j,k) = conj(tri[0][kj]) - tri[0][jk] - conj(tri[1][kj]) + tri[1][jk];
-                jk++;
+#pragma unroll
+            for (int j = 0; j < N; ++j) {
+#pragma unroll
+              for (int k = 0; k < N; ++k) {
+                mat(j, k) = conj(A0(k + N, j)) - A0(j + N, k) - conj(A1(k + N, j)) + A1(j + N, k);
               }
             }
           } // mu == 3
         } // nu == 1
         else if (nu == 2){
-          if (mu == 3) {
-            for (int j=0; j<3; ++j) {
-              mat(j,j).y = diag[0][j] - diag[0][j+3] - diag[1][j] + diag[1][j+3];
+          if (mu == 3) { // Z T: mu is a direction index (0..3), so compare against 3, not the colour count N
+#pragma unroll
+            for (int j = 0; j < N; ++j) {
+              mat(j, j).imag(A0(j, j).real() - A0(j + N, j + N).real() - A1(j, j).real() + A1(j + N, j + N).real());
+            }
-            int jk=0;
-            for (int j=1; j<3; ++j) {
-              int jk2 = (j+3)*(j+2)/2 + 3;
+#pragma unroll
+            for (int j = 1; j < N; ++j) {
+#pragma unroll
               for (int k=0; k<j; ++k) {
-                ctmp = tri[0][jk] - tri[0][jk2] - tri[1][jk] + tri[1][jk2];
-                mat(j,k).x = -ctmp.imag();
-                mat(j,k).y =  ctmp.real();
-
-                mat(k,j).x = ctmp.imag();
-                mat(k,j).y = ctmp.real();
-                jk++; jk2++;
+                auto ctmp = A0(j, k) - A0(j + N, k + N) - A1(j, k) + A1(j + N, k + N);
+                mat(j, k) = i_(ctmp);
+                mat(k, j) = i_(conj(ctmp));
               }
             }
           }
         }
 
-        mat *= arg.coeff;
-        arg.output((mu-1)*mu/2 + nu, x, parity) = mat;
+        arg.output((mu - 1) * mu / 2 + nu, x, arg.parity) = arg.coeff * mat;
       } // nu
-    } // mu
+    }   // mu
   }
 
   template <typename Arg> struct CloverSigmaTr
@@ -154,11 +163,7 @@ namespace quda {
     constexpr CloverSigmaTr(const Arg &arg) : arg(arg) {}
     static constexpr const char* filename() { return KERNEL_FILE; }
 
-    __device__ __host__ inline void operator()(int x_cb)
-    {
-      // odd parity
-      cloverSigmaTraceCompute<Arg>(arg, x_cb, 1);
-    }
+    __device__ __host__ inline void operator()(int x_cb) { cloverSigmaTraceCompute<Arg>(arg, x_cb); }
   };
 
 }
diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh
index a329fbb3dd..c4a038367b 100644
--- a/include/kernels/coarse_op_kernel.cuh
+++ b/include/kernels/coarse_op_kernel.cuh
@@ -1388,9 +1388,10 @@ namespace quda {
   };
 
   template <> struct storeCoarseSharedAtomic_impl<true> {
-    template <typename Arg> using CacheT =
-      complex<storeType>[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
-    template <typename Arg> using Cache = SharedMemoryCache<CacheT<Arg>,opDimsStatic<2,1,1>>;
+    template <typename Arg>
+    using CacheT = complex<storeType>[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4]
+                                     [Arg::coarseSpin][Arg::coarseSpin];
+    template <typename Arg> using Cache = SharedMemoryCache<CacheT<Arg>, DimsStatic<2, 1, 1>>;
     template <typename Arg> using Ops = SpecialOps<Cache<Arg>>;
 
     template <bool allthreads, typename VUV, typename Pack, typename Ftor>
@@ -1402,8 +1403,6 @@ namespace quda {
       using real = typename Arg::Float;
       using TileType = typename Arg::vuvTileType;
       const int dim_index = arg.dim_index % arg.Y_atomic.geometry;
-      //__shared__ complex<storeType> X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
-      //__shared__ complex<storeType> Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin];
       Cache<Arg> cache{ftor};
       auto &X = cache.data()[0];
       auto &Y = cache.data()[1];
@@ -1428,7 +1427,6 @@ namespace quda {
         }
       }
 
-      //__syncthreads();
       cache.sync();
 
 #pragma unroll
@@ -1458,7 +1456,6 @@ namespace quda {
         }
       }
 
-      //__syncthreads();
       cache.sync();
 
       if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) {
diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh
index d675408c88..bf71d1ad02 100644
--- a/include/kernels/color_spinor_pack.cuh
+++ b/include/kernels/color_spinor_pack.cuh
@@ -174,15 +174,16 @@ namespace quda {
   };
 
   template <> struct site_max<true> {
-    template <typename Arg> static constexpr int Ms = spins_per_thread<true>(Arg::nSpin);
-    template <typename Arg> static constexpr int Mc = colors_per_thread<true>(Arg::nColor);
-    template <typename Arg> static constexpr int color_spin_threads = (Arg::nSpin/Ms<Arg>) * (Arg::nColor/Mc<Arg>);
     template <typename Arg> struct CacheDims {
-      template <typename ...A> static constexpr dim3 dims(dim3 b, A &...) {
-	dim3 block = b;
-	if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
-	block.y = color_spin_threads<Arg>; // state the y block since we know it at compile time
-	return block;
+      static constexpr int Ms = spins_per_thread<true>(Arg::nSpin);
+      static constexpr int Mc = colors_per_thread<true>(Arg::nColor);
+      static constexpr int color_spin_threads = (Arg::nSpin / Ms) * (Arg::nColor / Mc);
+      static constexpr dim3 dims(dim3 block)
+      {
+        // pad the shared block size to avoid bank conflicts for native ordering
+        if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size();
+        block.y = color_spin_threads; // state the y block since we know it at compile time
+        return block;
       }
     };
     template <typename Arg> using Cache = SharedMemoryCache<typename Arg::real, CacheDims<Arg>>;
@@ -192,12 +193,13 @@ namespace quda {
     {
       using Arg = typename Ftor::Arg;
       using real = typename Arg::real;
+      constexpr int color_spin_threads = CacheDims<Arg>::color_spin_threads;
       Cache<Arg> cache{ftor};
       cache.save(thread_max);
       cache.sync();
       real this_site_max = static_cast<real>(0);
 #pragma unroll
-      for (int sc = 0; sc < color_spin_threads<Arg>; sc++) {
+      for (int sc = 0; sc < color_spin_threads; sc++) {
         auto sc_max = cache.load_y(sc);
         this_site_max = this_site_max > sc_max ? this_site_max : sc_max;
       }
diff --git a/include/kernels/dslash_clover_helper.cuh b/include/kernels/dslash_clover_helper.cuh
index a21beead5e..ab985b5b4d 100644
--- a/include/kernels/dslash_clover_helper.cuh
+++ b/include/kernels/dslash_clover_helper.cuh
@@ -209,7 +209,6 @@ namespace quda {
 
       Mat A = arg.clover(x_cb, clover_parity, chirality);
 
-      //SharedMemoryCache<half_fermion> cache(target::block_dim());
       SharedMemoryCache<half_fermion> cache{*this};
 
       half_fermion in_chi[n_flavor]; // flavor array of chirally projected fermion
diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh
index 87e6eb75be..37decb5b56 100644
--- a/include/kernels/dslash_coarse.cuh
+++ b/include/kernels/dslash_coarse.cuh
@@ -303,7 +303,6 @@ namespace quda {
     //template <typename T, typename Arg> __device__ __host__ inline void operator()(T &out, int dir, int dim, const Arg &arg)
     template <typename T, typename Ftor> __device__ __host__ inline void operator()(T &out, int dir, int dim, const Ftor &ftor)
     {
-      //SharedMemoryCache<T> cache(target::block_dim());
       SharedMemoryCache<T> cache{ftor};
       // only need to write to shared memory if not master thread
       if (dim > 0 || dir) cache.save(out);
diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh
index 534cf3fe2f..0ed11abfe2 100644
--- a/include/kernels/gauge_stout.cuh
+++ b/include/kernels/gauge_stout.cuh
@@ -117,8 +117,8 @@ namespace quda
     using real = typename Arg::Float;
     using Complex = complex<real>;
     using Link = Matrix<complex<real>, Arg::nColor>;
-    using StapCacheT = ThreadLocalCache<Link,0,computeStapleRectangleOps>;
-    using RectCacheT = ThreadLocalCache<Link,0,StapCacheT>;
+    using StapCacheT = ThreadLocalCache<Link,0,computeStapleRectangleOps>;  // offset by computeStapleRectangleOps
+    using RectCacheT = ThreadLocalCache<Link,0,StapCacheT>;  // offset by StapCacheT
     using Ops = combineOps<computeStapleRectangleOps,SpecialOps<StapCacheT,RectCacheT>>;
   };
 
@@ -148,8 +148,6 @@ namespace quda
       }
 
       Link U, Q;
-      //SharedMemoryCache<Link> Stap(target::block_dim());
-      //SharedMemoryCache<Link> Rect(target::block_dim(), sizeof(Link));
       typename OvrImpSTOUTOps<Arg>::StapCacheT Stap{*this};
       typename OvrImpSTOUTOps<Arg>::RectCacheT Rect{*this};
 
diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh
index 743074be4c..c725cb3c30 100644
--- a/include/kernels/gauge_wilson_flow.cuh
+++ b/include/kernels/gauge_wilson_flow.cuh
@@ -62,8 +62,8 @@ namespace quda
     using real = typename Arg::real;
     using Link = Matrix<complex<real>, Arg::nColor>;
     using WilsonOps = computeStapleOps;
-    using StapOp = ThreadLocalCache<Link,0,computeStapleRectangleOps>;
-    using RectOp = ThreadLocalCache<Link,0,StapOp>;
+    using StapOp = ThreadLocalCache<Link,0,computeStapleRectangleOps>;  // offset by computeStapleRectangleOps
+    using RectOp = ThreadLocalCache<Link,0,StapOp>;  // offset by StapOp
     using SymanzikOps = combineOps<computeStapleRectangleOps,SpecialOps<StapOp,RectOp>>;
     using Ops = std::conditional_t<Arg::wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW, SymanzikOps, WilsonOps>;
   };
@@ -92,12 +92,8 @@ namespace quda
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
       // and the 1x2 and 2x1 rectangles of length 5. From the following paper:
       // https://arxiv.org/abs/0801.1165
-      //SharedMemoryCache<Link> Stap(target::block_dim());
-      //SharedMemoryCache<Link> Rect(target::block_dim(), sizeof(Link)); // offset to ensure non-overlapping allocations
       typename computeStapleOpsWF<Arg>::StapOp Stap{ftor};
       typename computeStapleOpsWF<Arg>::RectOp Rect{ftor};
-      //ThreadLocalCache<Link,0,computeStapleRectangleOps> Stap{ftor};
-      //ThreadLocalCache<Link,0,decltype(Stap)> Rect{ftor};
       computeStapleRectangle(ftor, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim);
       Z = arg.coeff1x1 * static_cast<const Link &>(Stap) + arg.coeff2x1 * static_cast<const Link &>(Rect);
       //break;
diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh
index c0156eab0b..5044943b37 100644
--- a/include/kernels/hisq_paths_force.cuh
+++ b/include/kernels/hisq_paths_force.cuh
@@ -394,7 +394,9 @@ namespace quda {
             2 multiplies, 1 add, 1 rescale
       */
       template <typename LinkCache>
-      __device__ __host__ inline void lepage_force(int x[4], int point_a, int parity_a, Link &force_mu, LinkCache &Uab_cache) {
+      __device__ __host__ inline void lepage_force(int x[4], int point_a, int parity_a, Link &force_mu,
+                                                   LinkCache &Uab_cache)
+      {
         int point_b = linkExtendedIndexShiftMILC<sig_positive>(x, arg.sig, arg);
         int parity_b = 1 - parity_a;
 
@@ -565,7 +567,6 @@ namespace quda {
         int point_a = e_cb;
         int parity_a = parity;
 
-        //SharedMemoryCache<Link> Uab_cache(target::block_dim());
         ThreadLocalCache<Link> Uab_cache{*this};
         // Scoped load of Uab
         {
@@ -723,7 +724,8 @@ namespace quda {
             4 multiplies, 2 adds, 2 rescales
       */
       template <typename LinkCache>
-      __device__ __host__ inline void all_link(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) {
+      __device__ __host__ inline void all_link(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache)
+      {
         auto mycoeff_seven = parity_sign<typename Arg::real>(parity_a) * coeff_sign<sig_positive, typename Arg::real>(parity_a) * arg.coeff_seven;
 
         int point_b = linkExtendedIndexShiftMILC<sig_positive>(x, arg.sig, arg);
@@ -817,7 +819,6 @@ namespace quda {
           force_sig = mm_add(mycoeff_seven * Oz, Od * Uda, force_sig);
           Matrix_cache.save(force_sig, 2);
         }
-
       }
 
       /**
@@ -836,7 +837,8 @@ namespace quda {
             2 multiplies, 2 adds, 2 rescales
       */
       template <typename LinkCache>
-      __device__ __host__ inline void side_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) {
+      __device__ __host__ inline void side_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache)
+      {
         int y[4] = {x[0], x[1], x[2], x[3]};
         int point_h = updateCoordExtendedIndexShiftMILC<flip_dir(nu_positive)>(y, arg.nu, arg);
         int parity_h = 1 - parity_a;
@@ -889,7 +891,8 @@ namespace quda {
             1 multiply, 1 add, 1 rescale
       */
       template <typename LinkCache>
-      __device__ __host__ inline void middle_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) {
+      __device__ __host__ inline void middle_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache)
+      {
         int point_b = linkExtendedIndexShiftMILC<sig_positive>(x, arg.sig, arg);
         int parity_b = 1 - parity_a;
 
@@ -976,8 +979,8 @@ namespace quda {
 
         // calculate p5_sig
 	constexpr int cacheLen = sig_positive ? 3 : 2;
-        //ThreadLocalCache<array<Link,cacheLen>> Matrix_cache{};
-        ThreadLocalCache<Link,cacheLen> Matrix_cache{*this};
+        ThreadLocalCache<Link, cacheLen> Matrix_cache{*this};
+
         if constexpr (sig_positive) {
           Link force_sig = arg.force(arg.sig, point_a, parity_a);
           Matrix_cache.save(force_sig, 2);
diff --git a/include/lattice_field.h b/include/lattice_field.h
index b92297eabc..462add3bde 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -317,82 +317,82 @@ namespace quda {
     /**
        Pinned memory buffer used for sending messages
     */
-    array<void *, 2> my_face_h = {};
+    mutable array<void *, 2> my_face_h = {};
 
     /**
        Mapped version of my_face_h
     */
-    array<void *, 2> my_face_hd = {};
+    mutable array<void *, 2> my_face_hd = {};
 
     /**
        Device memory buffer for sending messages
      */
-    array<void *, 2> my_face_d = {};
+    mutable array<void *, 2> my_face_d = {};
 
     /**
        Local pointers to the pinned my_face buffer
     */
-    array_3d<void *, 2, QUDA_MAX_DIM, 2> my_face_dim_dir_h = {};
+    mutable array_3d<void *, 2, QUDA_MAX_DIM, 2> my_face_dim_dir_h = {};
 
     /**
        Local pointers to the mapped my_face buffer
     */
-    array_3d<void *, 2, QUDA_MAX_DIM, 2> my_face_dim_dir_hd = {};
+    mutable array_3d<void *, 2, QUDA_MAX_DIM, 2> my_face_dim_dir_hd = {};
 
     /**
        Local pointers to the device ghost_send buffer
     */
-    array_3d<void *, 2, QUDA_MAX_DIM, 2> my_face_dim_dir_d = {};
+    mutable array_3d<void *, 2, QUDA_MAX_DIM, 2> my_face_dim_dir_d = {};
 
     /**
        Memory buffer used for receiving all messages
     */
-    array<void *, 2> from_face_h = {};
+    mutable array<void *, 2> from_face_h = {};
 
     /**
        Mapped version of from_face_h
     */
-    array<void *, 2> from_face_hd = {};
+    mutable array<void *, 2> from_face_hd = {};
 
     /**
        Device memory buffer for receiving messages
      */
-    array<void *, 2> from_face_d = {};
+    mutable array<void *, 2> from_face_d = {};
 
     /**
        Local pointers to the pinned from_face buffer
     */
-    array_3d<void *, 2, QUDA_MAX_DIM, 2> from_face_dim_dir_h = {};
+    mutable array_3d<void *, 2, QUDA_MAX_DIM, 2> from_face_dim_dir_h = {};
 
     /**
        Local pointers to the mapped from_face buffer
     */
-    array_3d<void *, 2, QUDA_MAX_DIM, 2> from_face_dim_dir_hd = {};
+    mutable array_3d<void *, 2, QUDA_MAX_DIM, 2> from_face_dim_dir_hd = {};
 
     /**
        Local pointers to the device ghost_recv buffer
     */
-    array_3d<void *, 2, QUDA_MAX_DIM, 2> from_face_dim_dir_d = {};
+    mutable array_3d<void *, 2, QUDA_MAX_DIM, 2> from_face_dim_dir_d = {};
 
     /**
        Message handles for receiving
     */
-    array_3d<MsgHandle *, 2, QUDA_MAX_DIM, 2> mh_recv = {};
+    mutable array_3d<MsgHandle *, 2, QUDA_MAX_DIM, 2> mh_recv = {};
 
     /**
        Message handles for sending
     */
-    array_3d<MsgHandle *, 2, QUDA_MAX_DIM, 2> mh_send = {};
+    mutable array_3d<MsgHandle *, 2, QUDA_MAX_DIM, 2> mh_send = {};
 
     /**
        Message handles for receiving
     */
-    array_3d<MsgHandle *, 2, QUDA_MAX_DIM, 2> mh_recv_rdma = {};
+    mutable array_3d<MsgHandle *, 2, QUDA_MAX_DIM, 2> mh_recv_rdma = {};
 
     /**
        Message handles for sending
     */
-    array_3d<MsgHandle *, 2, QUDA_MAX_DIM, 2> mh_send_rdma = {};
+    mutable array_3d<MsgHandle *, 2, QUDA_MAX_DIM, 2> mh_send_rdma = {};
 
     /**
        Message handles for receiving
@@ -427,7 +427,7 @@ namespace quda {
     /**
        Whether we have initialized communication for this field
     */
-    bool initComms = false;
+    mutable bool initComms = false;
 
     /**
        Whether we have initialized peer-to-peer communication
@@ -543,17 +543,17 @@ namespace quda {
        @param[in] no_comms_fill Whether to allocate halo buffers for
        dimensions that are not partitioned
     */
-    void createComms(bool no_comms_fill = false);
+    void createComms(bool no_comms_fill = false) const;
 
     /**
        Destroy the communication handlers
     */
-    void destroyComms();
+    void destroyComms() const;
 
     /**
        Create the inter-process communication handlers
     */
-    void createIPCComms();
+    void createIPCComms() const;
 
     /**
        Destroy the statically allocated inter-process communication handlers
@@ -774,19 +774,19 @@ namespace quda {
      */
     void *remoteFace_r() const;
 
-    virtual void gather(int, const qudaStream_t &) { errorQuda("Not implemented"); }
+    virtual void gather(int, const qudaStream_t &) const { errorQuda("Not implemented"); }
 
-    virtual void commsStart(int, const qudaStream_t &, bool, bool) { errorQuda("Not implemented"); }
+    virtual void commsStart(int, const qudaStream_t &, bool, bool) const { errorQuda("Not implemented"); }
 
-    virtual int commsQuery(int, const qudaStream_t &, bool, bool)
+    virtual int commsQuery(int, const qudaStream_t &, bool, bool) const
     {
       errorQuda("Not implemented");
       return 0;
     }
 
-    virtual void commsWait(int, const qudaStream_t &, bool, bool) { errorQuda("Not implemented"); }
+    virtual void commsWait(int, const qudaStream_t &, bool, bool) const { errorQuda("Not implemented"); }
 
-    virtual void scatter(int, const qudaStream_t &) { errorQuda("Not implemented"); }
+    virtual void scatter(int, const qudaStream_t &) const { errorQuda("Not implemented"); }
 
     /** Return the volume string used by the autotuner */
     auto VolString() const { return vol_string; }
diff --git a/include/quda.h b/include/quda.h
index c8392b2054..d11fd4cbd0 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -1511,9 +1511,9 @@ extern "C" {
   void createCloverQuda(QudaInvertParam* param);
 
   /**
-   * Compute the clover force contributions in each dimension mu given
-   * the array of solution fields, and compute the resulting momentum
-   * field.
+   * Compute the clover force contributions from a set of partial
+   * fractions stemming from a rational approximation suitable for use
+   * within MILC.
    *
    * @param mom Force matrix
    * @param dt Integrating step size
@@ -1532,6 +1532,23 @@ extern "C" {
 			      int nvector, double multiplicity, void *gauge,
 			      QudaGaugeParam *gauge_param, QudaInvertParam *inv_param);
 
+  /**
+   * Compute the force from a clover or twisted clover determinant or
+   * a set of partial fractions stemming from a rational approximation
+   * suitable for use from within tmLQCD.
+   *
+   * @param h_mom Host force matrix
+   * @param h_x Array of solution vectors x_i = ( Q^2 + s_i )^{-1} b
+   * @param h_x0 Array of source vectors necessary to compute the force of a ratio of determinants
+   * @param coeff Array of coefficients for the rational approximation or {1.0} for the determinant.
+   * @param nvector Number of solution vectors and coefficients
+   * @param gauge_param Gauge field meta data
+   * @param inv_param Dirac and solver meta data
+   * @param detratio If 0, compute the force of a determinant; otherwise compute the force from a ratio of determinants
+   */
+  void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coeff, int nvector,
+                                QudaGaugeParam *gauge_param, QudaInvertParam *inv_param, int detratio);
+
   /**
    * Compute the naive staggered force.  All fields must be in the same precision.
    *
diff --git a/include/reference_wrapper_helper.h b/include/reference_wrapper_helper.h
index 3f73709ca6..1ab313df4a 100644
--- a/include/reference_wrapper_helper.h
+++ b/include/reference_wrapper_helper.h
@@ -189,6 +189,15 @@ namespace quda
      */
     template <typename U> vector make_set(std::vector<U> &v) { return vector{v.begin(), v.end()}; }
 
+    /**
+       make_set is a helper function that creates a vector of
+       reference wrapped objects from the input reference argument.
+       This overload handles a vector_ref of objects, and is used to
+       convert a non-const set to a const set.
+       @param[in] v Vector argument we wish to wrap
+     */
+    template <typename U> vector make_set(const vector_ref<U> &v) { return vector {v.begin(), v.end()}; }
+
     /**
        make_set is a helper function that creates a vector of
        reference wrapped objects from the input reference argument.
diff --git a/include/targets/cuda/block_reduce_helper.h b/include/targets/cuda/block_reduce_helper.h
index 9408dd518d..59907822e8 100644
--- a/include/targets/cuda/block_reduce_helper.h
+++ b/include/targets/cuda/block_reduce_helper.h
@@ -2,7 +2,7 @@
 
 #include <target_device.h>
 #include <reducer.h>
-#include <helpers.h>
+#include <kernel_ops.h>
 
 /**
    @file block_reduce_helper.h
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 4a5420b166..0550ad62dd 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -6,6 +6,12 @@
 namespace quda
 {
 
+  /**
+     @brief Element type used for coalesced storage.
+   */
+  template <typename T>
+  using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;
+
   // pre-declaration of vector_load that we wish to specialize
   template <bool> struct vector_load_impl;
 
diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h
index 01b9bf47be..fd2c724081 100644
--- a/include/targets/cuda/thread_array.h
+++ b/include/targets/cuda/thread_array.h
@@ -11,7 +11,6 @@
 namespace quda
 {
   template <typename T, int n> struct thread_array : array<T, n> {
-    //constexpr thread_array() : array<T,n>() {}
     template <typename Ops> constexpr thread_array(Ops &ops) : array<T,n>() {}
     static constexpr unsigned int shared_mem_size(dim3) { return 0; }
   };
diff --git a/include/targets/cuda/tunable_kernel.h b/include/targets/cuda/tunable_kernel.h
index 035b638b1a..152dfd8a61 100644
--- a/include/targets/cuda/tunable_kernel.h
+++ b/include/targets/cuda/tunable_kernel.h
@@ -46,6 +46,7 @@ namespace quda
     std::enable_if_t<device::use_kernel_arg<Arg>(), qudaError_t>
     launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
     {
+      checkSharedBytes(tp);
 #ifdef JITIFY
       launch_error = launch_jitify<Functor, grid_stride, Arg>(kernel.name, tp, stream, arg);
 #else
@@ -63,6 +64,7 @@ namespace quda
     std::enable_if_t<!device::use_kernel_arg<Arg>(), qudaError_t>
     launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
     {
+      checkSharedBytes(tp);
 #ifdef JITIFY
       // note we do the copy to constant memory after the kernel has been compiled in launch_jitify
       launch_error = launch_jitify<Functor, grid_stride, Arg>(kernel.name, tp, stream, arg);
@@ -84,6 +86,7 @@ namespace quda
     template <template <typename> class Functor, typename Arg>
     void launch_cuda(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) const
     {
+      checkSharedBytes(tp);
       constexpr bool grid_stride = false;
       const_cast<TunableKernel *>(this)->launch_device<Functor, grid_stride>(KERNEL(raw_kernel), tp, stream, arg);
     }
diff --git a/include/targets/generic/kernel_ops.h b/include/targets/generic/kernel_ops.h
new file mode 100644
index 0000000000..20c163af75
--- /dev/null
+++ b/include/targets/generic/kernel_ops.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#include <target_device.h> // for device::warp_size()
+
+/**
+   @file kernel_ops.h
+   @brief Helper types whose static size() and dims() functions map
+   the thread block dimensions to an object size or to object
+   dimensions, plus the primary get_type helper template.
+ */
+
+namespace quda
+{
+
+  /**
+     @brief Used to declare an object of fixed size N, independent
+     of the block dimensions.
+   */
+  template <int N> struct SizeStatic {
+    static constexpr unsigned int size(dim3) { return N; }
+  };
+
+  /**
+     @brief Used to declare an object of fixed size per thread, N,
+     i.e., of total size N * block.x * block.y * block.z.
+   */
+  template <int N> struct SizePerThread {
+    static constexpr unsigned int size(dim3 block) { return N * block.x * block.y * block.z; }
+  };
+
+  /**
+     @brief Used to declare an object of size equal to the size of the block Z dimension.
+   */
+  struct SizeZ {
+    static constexpr unsigned int size(dim3 block) { return block.z; }
+  };
+
+  /**
+     @brief Used to declare an object of size equal to the block size
+     divided by the warp size, rounded up.
+   */
+  struct SizeBlockDivWarp {
+    static constexpr unsigned int size(dim3 block)
+    {
+      return (block.x * block.y * block.z + device::warp_size() - 1) / device::warp_size();
+    }
+  };
+
+  /**
+     @brief Used to declare an object of fixed size per thread, N,
+     with thread dimensions determined by D, i.e., D::dims(block).
+   */
+  template <typename D, int N = 1> struct SizeDims {
+    static constexpr unsigned int size(dim3 block)
+    {
+      dim3 dims = D::dims(block);
+      return dims.x * dims.y * dims.z * N;
+    }
+  };
+
+  /**
+     @brief Used to declare an object with dimensions given by the block size.
+   */
+  struct DimsBlock {
+    static constexpr dim3 dims(dim3 block) { return block; }
+  };
+
+  /**
+     @brief Used to declare an object with fixed dimensions (x, y, z),
+     independent of the block dimensions.
+   */
+  template <int x, int y, int z> struct DimsStatic {
+    static constexpr dim3 dims(dim3) { return dim3(x, y, z); }
+  };
+
+  /**
+     @brief Uniform helper for exposing type T, whether we are dealing
+     with an instance of T or some wrapper of T.  This primary
+     template exposes T itself; the second template parameter allows
+     wrapper types to provide their own partial specialization.
+   */
+  template <class T, class enable = void> struct get_type {
+    using type = T;
+  };
+
+} // namespace quda
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 3a513b648d..d9dc4e66d7 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -5,6 +5,12 @@
 namespace quda
 {
 
+  /**
+     @brief Element type used for coalesced storage.
+   */
+  template <typename T>
+  using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;
+
   /**
      @brief Non-specialized load operation
   */
diff --git a/include/targets/generic/shared_memory_cache_helper.h b/include/targets/generic/shared_memory_cache_helper.h
index abe065ce0c..a456608fce 100644
--- a/include/targets/generic/shared_memory_cache_helper.h
+++ b/include/targets/generic/shared_memory_cache_helper.h
@@ -1,6 +1,7 @@
 #pragma once
 
-#include <helpers.h>
+#include <load_store.h>  // for atom_t
+#include <kernel_ops.h>
 #include <target_device.h>
 #include <shared_memory_helper.h>
 #include <special_ops.h>
@@ -12,49 +13,38 @@
    sharing data between threads in a thread block.
  */
 
-/**
-   @file shared_memory_cache_helper.h
-   @brief Convenience overloads to allow SharedMemoryCache objects to
-   appear in simple expressions.  The actual implementation of
-   SharedMemoryCache is target specific, and located in e.g.,
-   include/targets/cuda/shared_memory_cache_helper.h, etc.
- */
-
 namespace quda
 {
 
   /**
      @brief Class which wraps around a shared memory cache for type T,
      where each thread in the thread block stores a unique value in
-     the cache which any other thread can access.
-
-     This accessor supports both explicit run-time block size and
-     compile-time sizing.
+     the cache which any other thread can access.  The data is stored
+     in a coalesced order with element size atom_t<T>.
 
-     * For run-time block size, the constructor should be initialied
-       with the desired block size.
+     The dimensions of the cache are determined by a call to
+     D::dims(target::block_dim()), and D defaults to having dimensions
+     equal to the block dimensions.
 
-     * For compile-time block size, no arguments should be passed to
-       the constructor, and then the second and third template
-       parameters correspond to the y and z dimensions of the block,
-       respectively.  The x dimension of the block will be set
-       according the maximum number of threads possible, given these
-       dimensions.
+     A byte offset into the shared memory region can be specified with
+     the type O, and is given by
+     O::shared_mem_size(target::block_dim()) if O is not void.
    */
   template <typename T, typename D = DimsBlock, typename O = void>
-  class SharedMemoryCache : SharedMemory<atom_t<T>, SizeDims<D,sizeof(T)/sizeof(atom_t<T>)>, O>
+  class SharedMemoryCache : SharedMemory<atom_t<T>, SizeDims<D, sizeof(T) / sizeof(atom_t<T>)>, O>
   {
+    using Smem = SharedMemory<atom_t<T>, SizeDims<D, sizeof(T) / sizeof(atom_t<T>)>, O>;
+
   public:
     using value_type = T;
     using dims_type = D;
-    using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
-    using Smem = SharedMemory<atom_t<T>, SizeDims<D,sizeof(T)/sizeof(atom_t<T>)>, O>;
+    using offset_type = O;
     using Smem::shared_mem_size;
-    //using opSmem = op_SharedMemory<T, SizeSmem<Smem>>;
-    //using dependencies = op_Sequential<op_blockSync,opSmem>;
-    //using dependentOps = SpecialOps<op_blockSync,opSmem>;
 
   private:
+    const dim3 block;
+    const int stride;
+    using Smem::sharedMem;
     using atom_t = atom_t<T>;
     static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");
 
@@ -62,12 +52,7 @@ namespace quda
     static constexpr int n_element = sizeof(T) / sizeof(atom_t);
 
     // used to avoid instantiation of load functions if unused, in case T is not a valid return type (e.g. C array)
-    template <typename dummy = void> using maybeT = std::conditional_t<std::is_same_v<dummy,void>,T,void>;
-
-    const dim3 block;
-    const int stride;
-
-    using Smem::sharedMem;
+    template <typename dummy = void> using maybeT = std::conditional_t<std::is_same_v<dummy, void>, T, void>;
 
     __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
     {
@@ -78,8 +63,7 @@ namespace quda
       for (int i = 0; i < n_element; i++) sharedMem()[i * stride + j] = tmp[i];
     }
 
-    template <typename dummy = void>
-    __device__ __host__ inline maybeT<dummy> load_detail(int x, int y, int z) const
+    template <typename dummy = void> __device__ __host__ inline maybeT<dummy> load_detail(int x, int y, int z) const
     {
       atom_t tmp[n_element];
       int j = (z * block.y + y) * block.x + x;
@@ -106,31 +90,15 @@ namespace quda
 
   public:
     /**
-       @brief constructor for SharedMemory cache.  If no arguments are
-       pass, then the dimensions are set according to the templates
-       block_size_y and block_size_z, together with the derived
-       block_size_x.  Otherwise use the block sizes passed into the
-       constructor.
-
-       @param[in] block Block dimensions for the 3-d shared memory object
-       @param[in] thread_offset "Perceived" offset from dynamic shared
-       memory base pointer (used when we have multiple caches in
-       scope).  Need to include block size to actual offset.
+       @brief Constructor for SharedMemoryCache.
     */
-#if 0
-    constexpr SharedMemoryCache() :
-      block(D::dims(target::block_dim())), stride(block.x * block.y * block.z)
-    {
-      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims<D>::size(dim3{8,8,8})*sizeof(T));
-    }
-#endif
-
     template <typename ...U, typename ...Arg>
-    HostDevice inline SharedMemoryCache(const SpecialOps<U...> &ops, Arg ...arg) :
+    constexpr SharedMemoryCache(const SpecialOps<U...> &ops, Arg ...arg) :
       Smem(ops), block(D::dims(target::block_dim(), arg...)), stride(block.x * block.y * block.z)
     {
       checkSpecialOp<SharedMemoryCache<T,D,O>,U...>();
-      static_assert(shared_mem_size(dim3{8,8,8})==Smem::get_offset(dim3{8,8,8})+SizeDims<D>::size(dim3{8,8,8})*sizeof(T));
+      static_assert(shared_mem_size(dim3 {32, 16, 8})
+		    == Smem::get_offset(dim3 {32, 16, 8}) + SizeDims<D>::size(dim3 {32, 16, 8}) * sizeof(T));
     }
 
     constexpr SharedMemoryCache(const SharedMemoryCache<T,D,O> &) = delete;
@@ -138,9 +106,7 @@ namespace quda
     /**
        @brief Grab the raw base address to shared memory.
     */
-    __device__ __host__ inline auto data() const {
-      return reinterpret_cast<T *>(&sharedMem()[0]);
-    }
+    __device__ __host__ inline auto data() const { return reinterpret_cast<T *>(&sharedMem()[0]); }
 
     /**
        @brief Save the value into the 3-d shared memory cache.
@@ -216,8 +182,7 @@ namespace quda
        @param[in] x The x index to use
        @return The value at coordinates (x,y,z)
     */
-    template <typename dummy = void>
-    __device__ __host__ inline maybeT<dummy> load_x(int x = -1) const
+    template <typename dummy = void> __device__ __host__ inline maybeT<dummy> load_x(int x = -1) const
     {
       auto tid = target::thread_idx();
       x = (x == -1) ? tid.x : x;
@@ -229,8 +194,7 @@ namespace quda
        @param[in] y The y index to use
        @return The value at coordinates (x,y,z)
     */
-    template <typename dummy = void>
-    __device__ __host__ inline maybeT<dummy> load_y(int y = -1) const
+    template <typename dummy = void> __device__ __host__ inline maybeT<dummy> load_y(int y = -1) const
     {
       auto tid = target::thread_idx();
       y = (y == -1) ? tid.y : y;
@@ -242,8 +206,7 @@ namespace quda
        @param[in] z The z index to use
        @return The value at coordinates (x,y,z)
     */
-    template <typename dummy = void>
-    __device__ __host__ inline maybeT<dummy> load_z(int z = -1) const
+    template <typename dummy = void> __device__ __host__ inline maybeT<dummy> load_z(int z = -1) const
     {
       auto tid = target::thread_idx();
       z = (z == -1) ? tid.z : z;
@@ -259,8 +222,7 @@ namespace quda
        @brief Cast operator to allow cache objects to be used where T
        is expected
      */
-    template <typename dummy = void>
-    __device__ __host__ operator maybeT<dummy>() const { return load(); }
+    template <typename dummy = void> __device__ __host__ operator maybeT<dummy>() const { return load(); }
 
     /**
        @brief Assignment operator to allow cache objects to be used on
@@ -315,11 +277,12 @@ namespace quda
 
   /**
      @brief Uniform helper for exposing type T, whether we are dealing
-     with an instance of T or SharedMemoryCache<T>
+     with an instance of T or SharedMemoryCache<T,D,O>
    */
   template <class T>
-  struct get_type<
-    T, std::enable_if_t<std::is_same_v<T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
+  struct get_type<T,
+                  std::enable_if_t<std::is_same_v<
+                    T, SharedMemoryCache<typename T::value_type, typename T::dims_type, typename T::offset_type>>>> {
     using type = typename T::value_type;
   };
 
diff --git a/include/targets/generic/special_ops.h b/include/targets/generic/special_ops.h
index 05bc872cea..b86d0d1c86 100644
--- a/include/targets/generic/special_ops.h
+++ b/include/targets/generic/special_ops.h
@@ -1,6 +1,6 @@
 #pragma once
 #include <target_device.h>
-#include <helpers.h>
+#include <kernel_ops.h>
 
 namespace quda {
 
diff --git a/include/targets/generic/thread_array.h b/include/targets/generic/thread_array.h
index 8c0633959f..e0275c9a28 100644
--- a/include/targets/generic/thread_array.h
+++ b/include/targets/generic/thread_array.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <helpers.h>
+#include <kernel_ops.h>
 #include <shared_memory_helper.h>
 #include <array.h>
 
@@ -8,26 +8,22 @@ namespace quda
 {
 
   /**
-     @brief Class that provides indexable per-thread storage.  On CUDA
-     this maps to using assigning each thread a unique window of
-     shared memory using the SharedMemoryCache object.
+     @brief Class that provides indexable per-thread storage for n
+     elements of type T.  This version uses shared memory for storage.
+     The offset into the shared memory region is determined from the
+     type O.
    */
-  template <typename T, int n, typename O = void>
-  class thread_array : SharedMemory<array<T,n>, SizePerThread<1>, O>
+  template <typename T, int n, typename O = void> class thread_array : SharedMemory<array<T, n>, SizePerThread<1>, O>
   {
-  public:
-    using SharedMemoryT = SharedMemory<array<T,n>, SizePerThread<1>, O>;
-
-  private:
-    using SharedMemoryT::sharedMem;
+    using Smem = SharedMemory<array<T, n>, SizePerThread<1>, O>;
+    using Smem::sharedMem;
     array<T, n> &array_;
 
   public:
-    using SharedMemoryT::shared_mem_size;
+    using Smem::shared_mem_size;
 
 #if 0
-    __device__ __host__ constexpr thread_array() :
-      array_(sharedMem()[target::thread_idx_linear<3>()])
+    __device__ __host__ constexpr thread_array() : array_(sharedMem()[target::thread_idx_linear<3>()])
     {
       array_ = array<T, n>(); // call default constructor
     }
@@ -44,7 +40,7 @@ namespace quda
 
     template <typename... U>
     __device__ __host__ constexpr thread_array(const SpecialOps<U...> &ops) :
-      SharedMemoryT(ops),
+      Smem(ops),
       array_(sharedMem()[target::thread_idx_linear<3>()])
     {
       checkSpecialOp<thread_array<T,n,O>,U...>();
diff --git a/include/targets/generic/thread_local_cache.h b/include/targets/generic/thread_local_cache.h
index 340ca8a366..53e432556f 100644
--- a/include/targets/generic/thread_local_cache.h
+++ b/include/targets/generic/thread_local_cache.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <helpers.h>
+#include <kernel_ops.h>
 #include <target_device.h>
 #include <shared_memory_helper.h>
 
@@ -15,40 +15,38 @@ namespace quda
 {
 
   /**
-     @brief Class for threads to store a unique value, or array of values, which can use
-     shared memory for optimization purposes.
+     @brief Class for threads to store a unique value (for N_ == 0),
+     or array of values (for N_ > 0), which can use shared memory for
+     optimization purposes.
    */
-  template <typename T, int N_ = 0, typename O = void> class ThreadLocalCache :
-    SharedMemory<atom_t<T>, SizePerThread<std::max(1,N_)*sizeof(T)/sizeof(atom_t<T>)>, O>
+  template <typename T, int N_ = 0, typename O = void>
+  class ThreadLocalCache : SharedMemory<atom_t<T>, SizePerThread<std::max(1, N_) * sizeof(T) / sizeof(atom_t<T>)>, O>
   {
+    using Smem = SharedMemory<atom_t<T>, SizePerThread<std::max(1, N_) * sizeof(T) / sizeof(atom_t<T>)>, O>;
+
   public:
     using value_type = T;
     static constexpr int N = N_; // size of array, 0 means to behave like T instead of array<T, 1>
-    using offset_type = O; // type of object that may also use shared memory at the same time and is located before this one
-    static constexpr int len = std::max(1,N); // actual number of elements to store
-    using SharedMemoryT = SharedMemory<atom_t<T>, SizePerThread<std::max(1,N_)*sizeof(T)/sizeof(atom_t<T>)>, O>;
-    using SharedMemoryT::sharedMem;
-    using SharedMemoryT::shared_mem_size;
-    //using opSmem = op_SharedMemory<T, SizeSmem<SharedMemoryT>>;
-    //using dependencies = opSmem;
-    //using dependentOps = SpecialOps<opSmem>;
+    using offset_type = O;       // type of object using shared memory at the same time that is located before this one
+    static constexpr int len = std::max(1, N); // actual number of elements to store
+    using Smem::shared_mem_size;
 
   private:
+    const int stride;
+    using Smem::sharedMem;
     using atom_t = atom_t<T>;
     static_assert(sizeof(T) % 4 == 0, "Thread local cache does not support sub-word size types");
 
     // The number of elements of type atom_t that we break T into for optimal shared-memory access
     static constexpr int n_element = sizeof(T) / sizeof(atom_t);
 
-    const int stride;
-
     __device__ __host__ inline void save_detail(const T &a, const int k) const
     {
       atom_t tmp[n_element];
       memcpy(tmp, (void *)&a, sizeof(T));
       int j = target::thread_idx_linear<3>();
 #pragma unroll
-      for (int i = 0; i < n_element; i++) sharedMem()[(k*n_element + i) * stride + j] = tmp[i];
+      for (int i = 0; i < n_element; i++) sharedMem()[(k * n_element + i) * stride + j] = tmp[i];
     }
 
     __device__ __host__ inline T load_detail(const int k) const
@@ -56,7 +54,7 @@ namespace quda
       atom_t tmp[n_element];
       int j = target::thread_idx_linear<3>();
 #pragma unroll
-      for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k*n_element + i) * stride + j];
+      for (int i = 0; i < n_element; i++) tmp[i] = sharedMem()[(k * n_element + i) * stride + j];
       T a;
       memcpy((void *)&a, tmp, sizeof(T));
       return a;
@@ -68,16 +66,16 @@ namespace quda
     */
 #if 0
     constexpr ThreadLocalCache() : stride(target::block_size<3>()) {
-      static_assert(shared_mem_size(dim3{8,8,8})==SharedMemoryT::get_offset(dim3{8,8,8})+SizePerThread<len>::size(dim3{8,8,8})*sizeof(T));
+      static_assert(shared_mem_size(dim3{32,16,8})==Smem::get_offset(dim3{32,16,8})+SizePerThread<len>::size(dim3{32,16,8})*sizeof(T));
     }
 #endif
 
     template <typename ...U>
-    constexpr ThreadLocalCache(const SpecialOps<U...> &ops) : SharedMemoryT(ops), stride(target::block_size<3>())
+    constexpr ThreadLocalCache(const SpecialOps<U...> &ops) : Smem(ops), stride(target::block_size<3>())
     {
       checkSpecialOp<ThreadLocalCache<T,N,O>,U...>();
-      static_assert(shared_mem_size(dim3{8,8,8})==
-		    SharedMemoryT::get_offset(dim3{8,8,8})+SizePerThread<len>::size(dim3{8,8,8})*sizeof(T));
+      static_assert(shared_mem_size(dim3{32,16,8})==
+		    Smem::get_offset(dim3{32,16,8})+SizePerThread<len>::size(dim3{32,16,8})*sizeof(T));
     }
 
     constexpr ThreadLocalCache(const ThreadLocalCache<T,N,O> &) = delete;
@@ -86,7 +84,8 @@ namespace quda
        @brief Save the value into the thread local cache.  Used when N==0 so cache acts like single object.
        @param[in] a The value to store in the thread local cache
      */
-    __device__ __host__ inline void save(const T &a) const {
+    __device__ __host__ inline void save(const T &a) const
+    {
       static_assert(N == 0);
       save_detail(a, 0);
     }
@@ -96,13 +95,18 @@ namespace quda
        @param[in] a The value to store in the thread local cache
        @param[in] k The index to use
      */
-    __device__ __host__ inline void save(const T &a, const int k) const { save_detail(a, k); }
+    __device__ __host__ inline void save(const T &a, const int k) const
+    {
+      static_assert(N > 0);
+      save_detail(a, k);
+    }
 
     /**
        @brief Load a value from the thread local cache.  Used when N==0 so cache acts like single object.
        @return The value stored in the thread local cache
      */
-    __device__ __host__ inline T load() const {
+    __device__ __host__ inline T load() const
+    {
       static_assert(N == 0);
       return load_detail(0);
     }
@@ -112,12 +116,17 @@ namespace quda
        @param[in] k The index to use
        @return The value stored in the thread local cache at that index
      */
-    __device__ __host__ inline T load(const int k) const { return load_detail(k); }
+    __device__ __host__ inline T load(const int k) const
+    {
+      static_assert(N > 0);
+      return load_detail(k);
+    }
 
     /**
        @brief Cast operator to allow cache objects to be used where T is expected (when N==0).
      */
-    __device__ __host__ operator T() const {
+    __device__ __host__ operator T() const
+    {
       static_assert(N == 0);
       return load();
     }
@@ -126,7 +135,8 @@ namespace quda
        @brief Assignment operator to allow cache objects to be used on
        the lhs where T is otherwise expected (when N==0).
      */
-    __device__ __host__ void operator=(const T &src) const {
+    __device__ __host__ void operator=(const T &src) const
+    {
       static_assert(N == 0);
       save(src);
     }
@@ -136,36 +146,46 @@ namespace quda
        @param[in] i The index to use
        @return The value stored in the thread local cache at that index
      */
-    __device__ __host__ T operator[](int i) { return load(i); }
+    __device__ __host__ T operator[](int i)
+    {
+      static_assert(N > 0);
+      return load(i);
+    }
   };
 
-  template <typename T, int N, typename O> __device__ __host__ inline T operator+(const ThreadLocalCache<T, N, O> &a, const T &b)
+  template <typename T, int N, typename O>
+  __device__ __host__ inline T operator+(const ThreadLocalCache<T, N, O> &a, const T &b)
   {
     return static_cast<const T &>(a) + b;
   }
 
-  template <typename T, int N, typename O> __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache<T, N, O> &b)
+  template <typename T, int N, typename O>
+  __device__ __host__ inline T operator+(const T &a, const ThreadLocalCache<T, N, O> &b)
   {
     return a + static_cast<const T &>(b);
   }
 
-  template <typename T, int N, typename O> __device__ __host__ inline T operator-(const ThreadLocalCache<T, N, O> &a, const T &b)
+  template <typename T, int N, typename O>
+  __device__ __host__ inline T operator-(const ThreadLocalCache<T, N, O> &a, const T &b)
   {
     return static_cast<const T &>(a) - b;
   }
 
-  template <typename T, int N, typename O> __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache<T, N, O> &b)
+  template <typename T, int N, typename O>
+  __device__ __host__ inline T operator-(const T &a, const ThreadLocalCache<T, N, O> &b)
   {
     return a - static_cast<const T &>(b);
   }
 
-  template <typename T, int N, typename O> __device__ __host__ inline auto operator+=(ThreadLocalCache<T, N, O> &a, const T &b)
+  template <typename T, int N, typename O>
+  __device__ __host__ inline auto operator+=(ThreadLocalCache<T, N, O> &a, const T &b)
   {
     a.save(static_cast<const T &>(a) + b);
     return a;
   }
 
-  template <typename T, int N, typename O> __device__ __host__ inline auto operator-=(ThreadLocalCache<T, N, O> &a, const T &b)
+  template <typename T, int N, typename O>
+  __device__ __host__ inline auto operator-=(ThreadLocalCache<T, N, O> &a, const T &b)
   {
     a.save(static_cast<const T &>(a) - b);
     return a;
@@ -178,10 +198,11 @@ namespace quda
 
   /**
      @brief Uniform helper for exposing type T, whether we are dealing
-     with an instance of T or ThreadLocalCache<T,O>
+     with an instance of T or ThreadLocalCache<T,N,O>
    */
   template <class T>
-  struct get_type<T, std::enable_if_t<std::is_same_v<T, ThreadLocalCache<typename T::value_type, T::N, typename T::offset_type>>>> {
+  struct get_type<
+    T, std::enable_if_t<std::is_same_v<T, ThreadLocalCache<typename T::value_type, T::N, typename T::offset_type>>>> {
     using type = typename T::value_type;
   };
 
diff --git a/include/targets/hip/load_store.h b/include/targets/hip/load_store.h
index 7b387bf2df..d1bfe4a955 100644
--- a/include/targets/hip/load_store.h
+++ b/include/targets/hip/load_store.h
@@ -5,6 +5,12 @@
 namespace quda
 {
 
+  /**
+     @brief Element type used for coalesced storage.
+   */
+  template <typename T>
+  using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;
+
   // pre-declaration of vector_load that we wish to specialize
   template <bool> struct vector_load_impl;
 
diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h
index 0f470bfcb6..4e20a663b3 100644
--- a/include/targets/hip/shared_memory_helper.h
+++ b/include/targets/hip/shared_memory_helper.h
@@ -58,7 +58,7 @@ namespace quda
        @brief Byte offset for this shared memory object.
     */
     template <typename ...Arg>
-    static constexpr unsigned int get_offset(dim3 block, Arg ...arg)
+    static constexpr unsigned int get_offset(dim3 block, Arg &...arg)
     {
       unsigned int o = 0;
       if constexpr (!std::is_same_v<O, void>) { o = O::shared_mem_size(block, arg...); }
@@ -81,7 +81,7 @@ namespace quda
     /**
        @brief Return this SharedMemory object.
     */
-    HostDevice constexpr auto sharedMem() const { return *this; }
+    constexpr auto sharedMem() const { return *this; }
 
     /**
        @brief Subscripting operator returning a reference to element.
diff --git a/include/targets/hip/tunable_kernel.h b/include/targets/hip/tunable_kernel.h
index 626e590438..b4bc0d07cf 100644
--- a/include/targets/hip/tunable_kernel.h
+++ b/include/targets/hip/tunable_kernel.h
@@ -43,6 +43,7 @@ namespace quda
     std::enable_if_t<device::use_kernel_arg<Arg>(), qudaError_t>
     launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
     {
+      checkSharedBytes(tp);
       launch_error = qudaLaunchKernel(kernel.func, tp, stream, static_cast<const void *>(&arg));
       return launch_error;
     }
@@ -51,6 +52,7 @@ namespace quda
     std::enable_if_t<!device::use_kernel_arg<Arg>(), qudaError_t>
     launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
     {
+      checkSharedBytes(tp);
       static_assert(sizeof(Arg) <= device::max_constant_size(), "Parameter struct is greater than max constant size");
       qudaMemcpyAsync(device::get_constant_buffer<Arg>(), &arg, sizeof(Arg), qudaMemcpyHostToDevice, stream);
       launch_error = qudaLaunchKernel(kernel.func, tp, stream, static_cast<const void *>(&arg));
@@ -67,6 +69,7 @@ namespace quda
     template <template <typename> class Functor, typename Arg>
     void launch_cuda(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) const
     {
+      checkSharedBytes(tp);
       constexpr bool grid_stride = false;
       const_cast<TunableKernel *>(this)->launch_device<Functor, grid_stride>(KERNEL(raw_kernel), tp, stream, arg);
     }
diff --git a/include/tunable_block_reduction.h b/include/tunable_block_reduction.h
index 322b664376..f75abf87e2 100644
--- a/include/tunable_block_reduction.h
+++ b/include/tunable_block_reduction.h
@@ -171,12 +171,17 @@ namespace quda
         return true;
       } else { // block.x (spacetime) was reset
 
+        auto next = param;
+        next.block.z += step_z;
+        auto shared_bytes = setSharedBytes(next);
+
         // we can advance spin/block-color since this is valid
         if (param.block.z < vector_length_z && param.block.z < device::max_threads_per_block_dim(2)
             && param.block.x * param.block.y * (param.block.z + step_z) <= device::max_threads_per_block()
-            && ((param.block.z + step_z) <= max_block_z)) {
+            && ((param.block.z + step_z) <= max_block_z) && shared_bytes <= this->maxSharedBytesPerBlock()) {
           param.block.z += step_z;
           param.grid.z = (vector_length_z + param.block.z - 1) / param.block.z;
+          param.shared_bytes = shared_bytes;
           return true;
         } else { // we have run off the end so let's reset
           param.block.z = step_z;
diff --git a/include/tunable_nd.h b/include/tunable_nd.h
index 8b995326d0..d3c782e157 100644
--- a/include/tunable_nd.h
+++ b/include/tunable_nd.h
@@ -283,8 +283,7 @@ namespace quda
 
         auto next = param;
         next.block.y += step_y;
-        auto shared_bytes = std::max(this->sharedBytesPerThread() * next.block.x * next.block.y * next.block.z,
-                                     this->sharedBytesPerBlock(next));
+        auto shared_bytes = this->setSharedBytes(next);
 
         // we can advance spin/block-color since this is valid
         if (param.block.y < vector_length_y && param.block.y < device::max_threads_per_block_dim(1)
@@ -297,7 +296,6 @@ namespace quda
         } else { // we have run off the end so let's reset
           param.block.y = step_y;
           param.grid.y = (vector_length_y + param.block.y - 1) / param.block.y;
-
           return false;
         }
       }
@@ -312,8 +310,7 @@ namespace quda
       Tunable::initTuneParam(param);
       param.block.y = step_y;
       param.grid.y = (vector_length_y + step_y - 1) / step_y;
-      param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z,
-                                    this->sharedBytesPerBlock(param));
+      this->setSharedBytes(param);
     }
 
     /**
@@ -325,8 +322,7 @@ namespace quda
       Tunable::defaultTuneParam(param);
       param.block.y = step_y;
       param.grid.y = (vector_length_y + step_y - 1) / step_y;
-      param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z,
-                                    this->sharedBytesPerBlock(param));
+      this->setSharedBytes(param);
     }
 
     /**
@@ -552,8 +548,7 @@ namespace quda
 
         auto next = param;
         next.block.z += step_z;
-        auto shared_bytes = std::max(this->sharedBytesPerThread() * next.block.x * next.block.y * next.block.z,
-                                     this->sharedBytesPerBlock(next));
+        auto shared_bytes = this->setSharedBytes(next);
 
         // we can advance spin/block-color since this is valid
         if (param.block.z < vector_length_z && param.block.z < device::max_threads_per_block_dim(2)
@@ -580,8 +575,7 @@ namespace quda
       TunableKernel2D_base<grid_stride>::initTuneParam(param);
       param.block.z = step_z;
       param.grid.z = (vector_length_z + step_z - 1) / step_z;
-      param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z,
-                                    this->sharedBytesPerBlock(param));
+      this->setSharedBytes(param);
     }
 
     /**
@@ -593,8 +587,7 @@ namespace quda
       TunableKernel2D_base<grid_stride>::defaultTuneParam(param);
       param.block.z = step_z;
       param.grid.z = (vector_length_z + step_z - 1) / step_z;
-      param.shared_bytes = std::max(this->sharedBytesPerThread() * param.block.x * param.block.y * param.block.z,
-                                    this->sharedBytesPerBlock(param));
+      this->setSharedBytes(param);
     }
 
     /**
diff --git a/include/tunable_reduction.h b/include/tunable_reduction.h
index f055684d52..0afe1f6e32 100644
--- a/include/tunable_reduction.h
+++ b/include/tunable_reduction.h
@@ -175,6 +175,7 @@ namespace quda
     {
       TunableKernel::initTuneParam(param);
       param.block.y = block_size_y;
+      setSharedBytes(param);
     }
 
     /**
@@ -185,6 +186,7 @@ namespace quda
     {
       TunableKernel::defaultTuneParam(param);
       param.block.y = block_size_y;
+      setSharedBytes(param);
     }
   };
 
@@ -341,11 +343,17 @@ namespace quda
       if (rtn) {
         return true;
       } else {
+
+        auto next = param;
+        next.block.z++;
+        auto shared_bytes = setSharedBytes(next);
+
         if (param.block.z < n_batch && param.block.z < device::max_threads_per_block_dim(2)
             && param.block.x * param.block.y * (param.block.z + 1) <= device::max_threads_per_block()
-            && param.block.z < n_batch_block_max) {
+            && param.block.z < n_batch_block_max && shared_bytes <= this->maxSharedBytesPerBlock()) {
           param.block.z++;
           param.grid.z = (n_batch + param.block.z - 1) / param.block.z;
+          param.shared_bytes = shared_bytes;
           return true;
         } else { // we have run off the end so let's reset
           param.block.z = 1;
@@ -364,6 +372,7 @@ namespace quda
       TunableReduction2D::initTuneParam(param);
       param.block = {param.block.x, param.block.y, 1};
       param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z};
+      setSharedBytes(param);
     }
 
     /**
@@ -375,6 +384,7 @@ namespace quda
       TunableReduction2D::defaultTuneParam(param);
       param.block = {param.block.x, param.block.y, 1};
       param.grid = {param.grid.x, param.grid.y, (n_batch + param.block.z - 1) / param.block.z};
+      setSharedBytes(param);
     }
   };
 
diff --git a/include/tune_quda.h b/include/tune_quda.h
index 9da6a82411..c2cc221d4e 100644
--- a/include/tune_quda.h
+++ b/include/tune_quda.h
@@ -133,6 +133,13 @@ namespace quda {
       }
     }
 
+    auto setSharedBytes(TuneParam &param) const
+    {
+      int nthreads = param.block.x * param.block.y * param.block.z;
+      param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param));
+      return param.shared_bytes;
+    }
+
     virtual bool advanceBlockDim(TuneParam &param) const
     {
       const unsigned int max_threads = maxBlockSize(param);
@@ -140,14 +147,12 @@ namespace quda {
       bool ret;
 
       param.block.x += blockStep();
-      int nthreads = param.block.x * param.block.y * param.block.z;
-      param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param));
+      setSharedBytes(param);
 
       if (param.block.x > max_threads || param.shared_bytes > max_shared
           || param.block.x * param.block.y * param.block.z > device::max_threads_per_block()) {
         resetBlockDim(param);
-        int nthreads = param.block.x * param.block.y * param.block.z;
-        param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param));
+        setSharedBytes(param);
         ret = false;
       } else {
         ret = true;
@@ -197,8 +202,7 @@ namespace quda {
 	if (param.shared_bytes > max_shared) {
 	  TuneParam next(param);
 	  advanceBlockDim(next); // to get next blockDim
-	  int nthreads = next.block.x * next.block.y * next.block.z;
-          param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(next));
+          param.shared_bytes = setSharedBytes(next);
           return false;
 	} else {
 	  return true;
@@ -284,8 +288,7 @@ namespace quda {
 
 	param.grid = dim3((minThreads()+param.block.x-1)/param.block.x, 1, 1);
       }
-      int nthreads = param.block.x*param.block.y*param.block.z;
-      param.shared_bytes = std::max(sharedBytesPerThread() * nthreads, sharedBytesPerBlock(param));
+      setSharedBytes(param);
     }
 
     /** sets default values for when tuning is disabled */
@@ -335,6 +338,23 @@ namespace quda {
         errorQuda("aux tuning enabled but param.aux is not initialized");
     }
 
+    /**
+     * @brief self-consistency check that the shared memory is set
+     * correctly (e.g., check that block size has been correctly
+     * factored in when setting shared_bytes)
+     */
+    void checkSharedBytes(const TuneParam &tp) const
+    {
+      auto tp2 = TuneParam(tp);
+      auto expected = setSharedBytes(tp2);
+      if (tp.shared_bytes < expected)
+        errorQuda("Shared bytes %u insufficient (expected %u)", tp.shared_bytes, expected);
+
+      if (sharedBytesPerThread() && sharedBytesPerBlock(tp))
+        errorQuda("Not supported: non-zero shared bytes per thread (%u) and per block (%u)", sharedBytesPerThread(),
+                  sharedBytesPerBlock(tp));
+    }
+
     /**
      * @brief Return the rank on which kernel tuning is performed.
      * This will default to 0, but can be globally overriden with the
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 6c18ccae43..d8c1d8342b 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -52,6 +52,7 @@ set (QUDA_OBJS
   blas_quda.cu multi_blas_quda.cu reduce_quda.cu
   multi_reduce_quda.cu reduce_helper.cu
   contract.cu comm_common.cpp communicator_stack.cpp
+  clover_force.cpp
   clover_deriv_quda.cu clover_invert.cu copy_gauge_extended.cu
   extract_gauge_ghost_extended.cu copy_color_spinor.cpp
   spinor_noise.cu spinor_dilute.cu
diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu
index 24615f84ce..ae78e95b10 100644
--- a/lib/clover_deriv_quda.cu
+++ b/lib/clover_deriv_quda.cu
@@ -1,97 +1,68 @@
 #include <tunable_nd.h>
 #include <gauge_field.h>
+#include <clover_field.h>
 #include <kernels/clover_deriv.cuh>
+#include <instantiate.h>
 
 namespace quda {
 
-  template <typename Float, QudaReconstructType recon>
-  class DerivativeClover : TunableKernel3D {
+  template <typename Float, int nColor, QudaReconstructType recon> class DerivativeClover : TunableKernel3D
+  {
     GaugeField &force;
-    GaugeField &gauge;
-    GaugeField &oprod;
+    const GaugeField &gauge;
+    const GaugeField &oprod;
     double coeff;
-    int parity;
-    unsigned int minThreads() const { return gauge.LocalVolumeCB(); }
-    unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array
+    unsigned int minThreads() const override { return gauge.LocalVolumeCB(); }
+    unsigned int sharedBytesPerThread() const override { return 4 * sizeof(int); } // for thread_array
 
   public:
-    DerivativeClover(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, int parity) :
-      TunableKernel3D(gauge, 2, 4),
-      force(force),
-      gauge(gauge),
-      oprod(oprod),
-      coeff(coeff),
-      parity(parity)
+    DerivativeClover(const GaugeField &gauge, GaugeField &force, const GaugeField &oprod, double coeff) :
+      TunableKernel3D(gauge, 2, 4), force(force), gauge(gauge), oprod(oprod), coeff(coeff)
     {
       apply(device::get_default_stream());
     }
 
-    void apply(const qudaStream_t &stream){
+    void apply(const qudaStream_t &stream) override
+    {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      launch<CloverDerivative>(tp, stream, CloverDerivArg<Float, recon>(force, gauge, oprod, coeff, parity));
+      launch<CloverDerivative>(tp, stream, CloverDerivArg<Float, nColor, recon>(force, gauge, oprod, coeff));
     }
 
     // The force field is updated so we must preserve its initial state
-    void preTune() { force.backup(); }
-    void postTune() { force.restore(); }
+    void preTune() override { force.backup(); }
+    void postTune() override { force.restore(); }
 
-    long long flops() const { return 16 * 198 * 3 * 4 * gauge.LocalVolume(); }
-    long long bytes() const
+    long long flops() const override
+    {
+      auto gemm_flops = 8 * nColor * nColor * nColor - 2 * nColor * nColor;
+      return 32 * gemm_flops * 12 * gauge.LocalVolume();
+    }
+    long long bytes() const override
     {
-      return ((8 * gauge.Reconstruct() + 4 * oprod.Reconstruct()) * 3 + 2 * force.Reconstruct()) * 4 * gauge.LocalVolume() * gauge.Precision();
+      return (16 * gauge.Reconstruct() + 8 * oprod.Reconstruct() + 2 * force.Reconstruct()) * 12 * gauge.Precision()
+        * gauge.LocalVolume();
     }
   };
 
-  template<typename Float>
-  void cloverDerivative(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, int parity)
+  void cloverDerivative(GaugeField &force, const GaugeField &gauge, const GaugeField &oprod, double coeff)
   {
-    if (oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Force field does not support reconstruction");
-    if (force.Order() != oprod.Order()) errorQuda("Force and Oprod orders must match");
-    if (force.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Force field does not support reconstruction");
+    if constexpr (is_enabled_clover()) {
+      checkPrecision(force, gauge, oprod);
+      assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY);
+      assert(force.Geometry() == QUDA_VECTOR_GEOMETRY);
+      if (oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Force field does not support reconstruction");
+      if (force.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Force field does not support reconstruction");
 
-    if (force.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
-      if (gauge.isNative()) {
-	if (gauge.Reconstruct() == QUDA_RECONSTRUCT_NO) {
-	  DerivativeClover<Float, QUDA_RECONSTRUCT_NO> deriv(force, gauge, oprod, coeff, parity);
-	} else {
-	  errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
-	}
-      } else {
-	errorQuda("Gauge order %d not supported", gauge.Order());
-      }
-    } else {
-      errorQuda("Force order %d not supported", force.Order());
-    } // force / oprod order
-  }
+      GaugeField *oprodEx = createExtendedGauge(oprod, gauge.R(), getProfile());
 
-#if defined(GPU_CLOVER_DIRAC) && (QUDA_PRECISION & 8)
-  void cloverDerivative(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, QudaParity parity)
-  {
-    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY);
-    assert(force.Geometry() == QUDA_VECTOR_GEOMETRY);
-
-    for (int d=0; d<4; d++) {
-      if (oprod.X()[d] != gauge.X()[d])
-        errorQuda("Incompatible extended dimensions d=%d gauge=%d oprod=%d", d, gauge.X()[d], oprod.X()[d]);
-    }
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+      instantiate<DerivativeClover, ReconstructNo12>(gauge, force, *oprodEx, coeff);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
 
-    if (force.Precision() == QUDA_DOUBLE_PRECISION) {
-      cloverDerivative<double>(force, gauge, oprod, coeff, (parity == QUDA_EVEN_PARITY) ? 0 : 1);
+      delete oprodEx;
     } else {
-      errorQuda("Precision %d not supported", force.Precision());
+      errorQuda("Clover has not been built");
     }
-    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-  }
-#else
-  void cloverDerivative(GaugeField &, GaugeField &, GaugeField &, double, QudaParity)
-  {
-#ifdef GPU_CLOVER_DIRAC
-    errorQuda("QUDA_PRECISION=%d does not enable double precision", QUDA_PRECISION);
-#else
-    errorQuda("Clover has not been built");
-#endif
   }
-#endif
 
 } // namespace quda
diff --git a/lib/clover_force.cpp b/lib/clover_force.cpp
new file mode 100644
index 0000000000..cf9706c359
--- /dev/null
+++ b/lib/clover_force.cpp
@@ -0,0 +1,88 @@
+#include "clover_field.h"
+#include "gauge_field.h"
+#include "color_spinor_field.h"
+#include "momentum.h"
+#include "blas_quda.h"
+#include "dirac_quda.h"
+
+namespace quda
+{
+
+  void computeCloverForce(GaugeField &mom, const GaugeField &gaugeEx, const GaugeField &gauge,
+                          const CloverField &clover, cvector_ref<ColorSpinorField> &x, cvector_ref<ColorSpinorField> &x0,
+                          const std::vector<double> &coeff, const std::vector<array<double, 2>> &epsilon,
+                          double sigma_coeff, bool detratio, QudaInvertParam &inv_param)
+  {
+    if (inv_param.matpc_type != QUDA_MATPC_EVEN_EVEN_ASYMMETRIC && inv_param.matpc_type != QUDA_MATPC_ODD_ODD_ASYMMETRIC)
+      errorQuda("MatPC type %d not supported", inv_param.matpc_type);
+
+    QudaParity parity = inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY;
+    QudaParity other_parity = static_cast<QudaParity>(1 - parity);
+    bool dagger = inv_param.dagger;
+    bool not_dagger = !dagger; // logical complement: guards the second Dslash below, which uses the opposite daggering
+
+    DiracParam diracParam;
+    setDiracParam(diracParam, &inv_param, true);
+    Dirac *dirac = Dirac::create(diracParam);
+
+    ColorSpinorParam csParam(x[0]);
+    csParam.create = QUDA_NULL_FIELD_CREATE;
+    std::vector<ColorSpinorField> p(x.size());
+    for (auto i = 0u; i < p.size(); i++) p[i] = ColorSpinorField(csParam);
+
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+
+    for (auto i = 0u; i < x.size(); i++) {
+      gamma5(p[i][parity], x[i][parity]);
+
+      if (dagger) dirac->Dagger(QUDA_DAG_YES);
+      dirac->Dslash(x[i][other_parity], p[i][parity], other_parity);
+      // want to apply \hat Q_{-} = \hat M_{+}^\dagger \gamma_5 to get Y_o
+      dirac->M(p[i][parity], p[i][parity]); // this is the odd part of Y
+      if (dagger) dirac->Dagger(QUDA_DAG_NO);
+
+      gamma5(x[i][other_parity], x[i][other_parity]);
+      if (detratio) blas::xpy(x0[i][parity], p[i][parity]);
+
+      if (not_dagger) dirac->Dagger(QUDA_DAG_YES);
+      dirac->Dslash(p[i][other_parity], p[i][parity], other_parity); // and now the even part of Y
+      if (not_dagger) dirac->Dagger(QUDA_DAG_NO);
+      // up to here x.odd matches X.odd in tmLQCD and p.odd=-Y.odd of tmLQCD
+      // x.Even= X.Even.tmLQCD/kappa and p.Even=-Y.Even.tmLQCD/kappa
+
+      // the gamma5 application in tmLQCD is done inside deriv_Sb
+      gamma5(p[i], p[i]);
+    }
+
+    delete dirac;
+
+    // create oprod and trace field
+    GaugeFieldParam param(mom);
+    param.link_type = QUDA_GENERAL_LINKS;
+    param.reconstruct = QUDA_RECONSTRUCT_NO;
+    param.create = QUDA_ZERO_FIELD_CREATE;
+    param.setPrecision(param.Precision(), true);
+    GaugeField force(param);
+    param.geometry = QUDA_TENSOR_GEOMETRY;
+    GaugeField oprod(param);
+
+    // derivative of the Wilson operator; it corresponds to deriv_Sb(OE,...) plus deriv_Sb(EO,...) in tmLQCD
+    computeCloverForce(force, gauge, x, p, coeff);
+    // derivative of the determinant of the sw term, second term of (A12) in hep-lat/0112051, sw_deriv(EE, mnl->mu) in tmLQCD
+    if (!detratio) computeCloverSigmaTrace(oprod, clover, sigma_coeff, other_parity);
+
+    // derivative of pseudofermion sw term, first term of (A12) in hep-lat/0112051, sw_spinor_eo(EE,..) plus
+    // sw_spinor_eo(OO,..) in tmLQCD
+    computeCloverSigmaOprod(oprod, inv_param.dagger == QUDA_DAG_YES ? p : x, inv_param.dagger == QUDA_DAG_YES ? x : p,
+                            epsilon);
+
+    // oprod = (A12) of hep-lat/0112051
+    // compute the insertion of oprod in Fig.27 of hep-lat/0112051
+    cloverDerivative(force, gaugeEx, oprod, 1.0);
+
+    updateMomentum(mom, -1.0, force, "clover");
+
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+  }
+
+} // namespace quda
diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu
index 903ce4e76c..dae2780c0a 100644
--- a/lib/clover_invert.cu
+++ b/lib/clover_invert.cu
@@ -46,20 +46,17 @@ namespace quda {
     void postTune() { if (clover::dynamic_inverse()) clover.restore(); }
   };
 
-#ifdef GPU_CLOVER_DIRAC
   void cloverInvert(CloverField &clover, bool computeTraceLog)
   {
-    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    if (clover.Reconstruct()) errorQuda("Cannot store the inverse with a reconstruct field");
-    if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
-    instantiate<CloverInvert>(clover, computeTraceLog);
-    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-  }
-#else
-  void cloverInvert(CloverField &, bool)
-  {
-    errorQuda("Clover has not been built");
+    if constexpr (is_enabled_clover()) { // true only when GPU_CLOVER_DIRAC is defined at build time
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+      if (clover.Reconstruct()) errorQuda("Cannot store the inverse with a reconstruct field");
+      if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
+      instantiate<CloverInvert>(clover, computeTraceLog);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+    } else { // clover support compiled out: calling this is reported as an error
+      errorQuda("Clover has not been built");
+    }
   }
-#endif
 
 } // namespace quda
diff --git a/lib/clover_outer_product.cu b/lib/clover_outer_product.cu
index d579476714..4750bcfb8c 100644
--- a/lib/clover_outer_product.cu
+++ b/lib/clover_outer_product.cu
@@ -20,7 +20,7 @@ namespace quda {
     const real coeff;
     OprodKernelType kernel;
     int dir;
-    unsigned int minThreads() const { return kernel == INTERIOR ? inB.VolumeCB() : inB.GhostFaceCB()[dir]; }
+    unsigned int minThreads() const override { return kernel == INTERIOR ? inB.VolumeCB() : inB.GhostFaceCB()[dir]; }
 
   public:
     CloverForce(const GaugeField &U, GaugeField &force, const ColorSpinorField& inA,
@@ -43,20 +43,17 @@ namespace quda {
       apply(device::get_default_stream());
 
       for (int i=3; i>=0; i--) {
+        dir = i;
         if (!commDimPartitioned(i)) continue;
         strcpy(aux, aux2);
-        strcat(aux, ",exterior");
-        if (dir==0) strcat(aux, ",dir=0");
-        else if (dir==1) strcat(aux, ",dir=1");
-        else if (dir==2) strcat(aux, ",dir=2");
-        else if (dir==3) strcat(aux, ",dir=3");
+        strcat(aux, ",exterior,dir=");
+        strcat(aux, dir == 0 ? "0" : dir == 1 ? "1" : dir == 2 ? "2" : "3");
         kernel = EXTERIOR;
-        dir = i;
         apply(device::get_default_stream());
       }
     }
 
-    void apply(const qudaStream_t &stream)
+    void apply(const qudaStream_t &stream) override
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
 
@@ -73,13 +70,13 @@ namespace quda {
       }
     }
 
-    void preTune() { force.backup(); }
-    void postTune() { force.restore(); }
+    void preTune() override { force.backup(); }
+    void postTune() override { force.restore(); }
 
     // spin trace + multiply-add (ignore spin-project)
-    long long flops() const { return minThreads() * (144 + 234) * (kernel == INTERIOR ? 4 : 1); }
+    long long flops() const override { return minThreads() * (144 + 234) * (kernel == INTERIOR ? 4 : 1); }
 
-    long long bytes() const
+    long long bytes() const override
     {
       if (kernel == INTERIOR) {
 	return inA.Bytes() + inC.Bytes() + 4*(inB.Bytes() + inD.Bytes()) + force.Bytes() + U.Bytes() / 2;
@@ -90,10 +87,12 @@ namespace quda {
     }
   }; // CloverForce
 
-  void exchangeGhost(ColorSpinorField &a, int parity, int dag) {
+  void exchangeGhost(const ColorSpinorField &a, int parity, int dag)
+  {
     // this sets the communications pattern for the packing kernel
     int comms[QUDA_MAX_DIM] = { commDimPartitioned(0), commDimPartitioned(1),
                                 commDimPartitioned(2), commDimPartitioned(3) };
+
     setPackComms(comms);
 
     // first transfer src1
@@ -132,42 +131,35 @@ namespace quda {
     comm_barrier();
   }
 
-#ifdef GPU_CLOVER_DIRAC
-  void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector<ColorSpinorField *> &x,
-                          std::vector<ColorSpinorField *> &p, std::vector<double> &coeff)
+  void computeCloverForce(GaugeField &force, const GaugeField &U, cvector_ref<const ColorSpinorField> &x,
+                          cvector_ref<const ColorSpinorField> &p, const std::vector<double> &coeff)
   {
-    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    checkNative(*x[0], *p[0], force, U);
-    checkPrecision(*x[0], *p[0], force, U);
-
-    int dag = 1;
-
-    for (unsigned int i=0; i<x.size(); i++) {
-      x[i]->Even().allocateGhostBuffer(1);
-      x[i]->Odd().allocateGhostBuffer(1);
-      p[i]->Even().allocateGhostBuffer(1);
-      p[i]->Odd().allocateGhostBuffer(1);
-
-      for (int parity=0; parity<2; parity++) {
-	ColorSpinorField& inA = (parity&1) ? p[i]->Odd() : p[i]->Even();
-	ColorSpinorField& inB = (parity&1) ? x[i]->Even(): x[i]->Odd();
-	ColorSpinorField& inC = (parity&1) ? x[i]->Odd() : x[i]->Even();
-	ColorSpinorField& inD = (parity&1) ? p[i]->Even(): p[i]->Odd();
-
-        exchangeGhost(inB, parity, dag);
-        exchangeGhost(inD, parity, 1-dag);
-
-        instantiate<CloverForce, ReconstructNo12>(U, force, inA, inB, inC, inD, parity, coeff[i]);
+    if constexpr (is_enabled_clover()) { // true only when GPU_CLOVER_DIRAC is defined at build time
+      checkNative(x[0], p[0], force, U);
+      checkPrecision(x[0], p[0], force, U);
+
+      int dag = 1; // inB halos are exchanged with dag, inD halos with 1 - dag
+
+      for (auto i = 0u; i < x.size(); i++) {
+        for (int parity = 0; parity < 2; parity++) {
+          const ColorSpinorField &inA = (parity & 1) ? x[i].Odd() : x[i].Even();  // x, same parity
+          const ColorSpinorField &inB = (parity & 1) ? p[i].Even() : p[i].Odd();  // p, opposite parity
+          const ColorSpinorField &inC = (parity & 1) ? p[i].Odd() : p[i].Even();  // p, same parity
+          const ColorSpinorField &inD = (parity & 1) ? x[i].Even() : x[i].Odd();  // x, opposite parity
+
+          getProfile().TPSTART(QUDA_PROFILE_COMMS); // the two ghost exchanges are timed as comms
+          exchangeGhost(inB, parity, dag);
+          exchangeGhost(inD, parity, 1 - dag);
+          getProfile().TPSTOP(QUDA_PROFILE_COMMS);
+
+          getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // the kernel instantiation is timed as compute
+          instantiate<CloverForce, ReconstructNo12>(U, force, inA, inB, inC, inD, parity, coeff[i]);
+          getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+        }
       }
+    } else { // clover support compiled out: calling this is reported as an error
+      errorQuda("Clover Dirac operator has not been built!");
     }
-    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-  }
-#else // GPU_CLOVER_DIRAC not defined
-  void computeCloverForce(GaugeField &, const GaugeField &, std::vector<ColorSpinorField *> &,
-                          std::vector<ColorSpinorField *> &, std::vector<double> &)
-  {
-    errorQuda("Clover Dirac operator has not been built!");
   }
-#endif
 
 } // namespace quda
diff --git a/lib/clover_quda.cu b/lib/clover_quda.cu
index c000310f6b..44dbc45221 100644
--- a/lib/clover_quda.cu
+++ b/lib/clover_quda.cu
@@ -34,21 +34,18 @@ namespace quda {
     long long bytes() const { return 2*arg.threads.x*(6*arg.f.Bytes() + arg.clover.Bytes()); }
   };
 
-#ifdef GPU_CLOVER_DIRAC
   void computeClover(CloverField &clover, const GaugeField& f, double coeff)
   {
-    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
-    clover.Diagonal(0.5); // 0.5 comes from scaling used on native fields
-    instantiate<ComputeClover>(clover, f, coeff);
-    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-  }
-#else
-  void computeClover(CloverField &, const GaugeField &, double)
-  {
-    errorQuda("Clover has not been built");
+    if constexpr (is_enabled_clover()) { // true only when GPU_CLOVER_DIRAC is defined at build time
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+      if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
+      clover.Diagonal(0.5); // 0.5 comes from scaling used on native fields
+      instantiate<ComputeClover>(clover, f, coeff);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+    } else { // clover support compiled out: calling this is reported as an error
+      errorQuda("Clover has not been built");
+    }
   }
-#endif
 
 } // namespace quda
 
diff --git a/lib/clover_sigma_outer_product.cu b/lib/clover_sigma_outer_product.cu
index 370ada813f..a6b9750e79 100644
--- a/lib/clover_sigma_outer_product.cu
+++ b/lib/clover_sigma_outer_product.cu
@@ -14,20 +14,15 @@ namespace quda {
   {
     template <int nvector> using Arg = CloverSigmaOprodArg<Float, nColor, nvector>;
     GaugeField &oprod;
-    const std::vector<ColorSpinorField*> &inA;
-    const std::vector<ColorSpinorField*> &inB;
-    const std::vector<std::vector<double>> &coeff;
-    unsigned int minThreads() const { return oprod.VolumeCB(); }
+    cvector_ref<const ColorSpinorField> &inA;
+    cvector_ref<const ColorSpinorField> &inB;
+    const std::vector<array<double, 2>> &coeff;
+    unsigned int minThreads() const override { return oprod.VolumeCB(); }
 
   public:
-    CloverSigmaOprod(GaugeField &oprod, const std::vector<ColorSpinorField*> &inA,
-                     const std::vector<ColorSpinorField*> &inB,
-                     const std::vector<std::vector<double>> &coeff) :
-      TunableKernel3D(oprod, 2, 6),
-      oprod(oprod),
-      inA(inA),
-      inB(inB),
-      coeff(coeff)
+    CloverSigmaOprod(GaugeField &oprod, cvector_ref<const ColorSpinorField> &inA,
+                     cvector_ref<const ColorSpinorField> &inB, const std::vector<array<double, 2>> &coeff) :
+      TunableKernel3D(oprod, 2, 6), oprod(oprod), inA(inA), inB(inB), coeff(coeff)
     {
       char tmp[16];
       sprintf(tmp, ",nvector=%lu", inA.size());
@@ -35,7 +30,7 @@ namespace quda {
       apply(device::get_default_stream());
     }
 
-    void apply(const qudaStream_t &stream)
+    void apply(const qudaStream_t &stream) override
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       switch (inA.size()) {
@@ -44,54 +39,38 @@ namespace quda {
       }
     } // apply
 
-    void preTune() { oprod.backup(); }
-    void postTune() { oprod.restore(); }
+    void preTune() override { oprod.backup(); }
+    void postTune() override { oprod.restore(); }
 
-    long long flops() const
+    long long flops() const override
     {
       return ((144 + 18) * inA.size() + 18) * 6 * oprod.Volume(); // spin trace + multiply-add
     }
-    long long bytes() const
-    {
-      return (inA[0]->Bytes() + inB[0]->Bytes()) * inA.size() * 6 + 2 * oprod.Bytes();
-    }
+    long long bytes() const override { return (inA[0].Bytes() + inB[0].Bytes()) * inA.size() * 6 + 2 * oprod.Bytes(); }
   }; // CloverSigmaOprod
 
-#ifdef GPU_CLOVER_DIRAC
-  void computeCloverSigmaOprod(GaugeField& oprod, std::vector<ColorSpinorField*> &x,
-			       std::vector<ColorSpinorField*> &p, std::vector<std::vector<double> > &coeff)
+  void computeCloverSigmaOprod(GaugeField &oprod, cvector_ref<const ColorSpinorField> &x,
+                               cvector_ref<const ColorSpinorField> &p, const std::vector<array<double, 2>> &coeff)
   {
-    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    if (x.size() > MAX_NVECTOR) {
-      // divide and conquer
-      std::vector<ColorSpinorField*> x0(x.begin(), x.begin()+x.size()/2);
-      std::vector<ColorSpinorField*> p0(p.begin(), p.begin()+p.size()/2);
-      std::vector<std::vector<double> > coeff0(coeff.begin(), coeff.begin()+coeff.size()/2);
-      for (unsigned int i=0; i<coeff0.size(); i++) {
-	coeff0[i].reserve(2); coeff0[i][0] = coeff[i][0]; coeff0[i][1] = coeff[i][1];
-      }
-      computeCloverSigmaOprod(oprod, x0, p0, coeff0);
+    if constexpr (is_enabled_clover()) {
+      if (x.size() > MAX_NVECTOR) {
+        // divide and conquer: recurse on each half; only the leaf calls start/stop the compute timer
+        computeCloverSigmaOprod(oprod, cvector_ref<const ColorSpinorField> {x.begin(), x.begin() + x.size() / 2},
+                                cvector_ref<const ColorSpinorField> {p.begin(), p.begin() + p.size() / 2},
+                                {coeff.begin(), coeff.begin() + coeff.size() / 2});
 
-      std::vector<ColorSpinorField*> x1(x.begin()+x.size()/2, x.end());
-      std::vector<ColorSpinorField*> p1(p.begin()+p.size()/2, p.end());
-      std::vector<std::vector<double> > coeff1(coeff.begin()+coeff.size()/2, coeff.end());
-      for (unsigned int i=0; i<coeff1.size(); i++) {
-	coeff1[i].reserve(2); coeff1[i][0] = coeff[coeff.size()/2 + i][0]; coeff1[i][1] = coeff[coeff.size()/2 + i][1];
+        computeCloverSigmaOprod(oprod, cvector_ref<const ColorSpinorField> {x.begin() + x.size() / 2, x.end()},
+                                cvector_ref<const ColorSpinorField> {p.begin() + p.size() / 2, p.end()},
+                                {coeff.begin() + coeff.size() / 2, coeff.end()});
+        return;
       }
-      computeCloverSigmaOprod(oprod, x1, p1, coeff1);
 
-      return;
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // started after the early return so each start has a matching stop
+      instantiate<CloverSigmaOprod>(oprod, x, p, coeff);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+    } else {
+      errorQuda("Clover Dirac operator has not been built!");
     }
-
-    instantiate<CloverSigmaOprod>(oprod, x, p, coeff);
-    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-  }
-#else // GPU_CLOVER_DIRAC not defined
-  void computeCloverSigmaOprod(GaugeField &, std::vector<ColorSpinorField*> &,
-			       std::vector<ColorSpinorField*> &, std::vector<std::vector<double> > &)
-  {
-    errorQuda("Clover Dirac operator has not been built!");
   }
-#endif
 
 } // namespace quda
diff --git a/lib/clover_trace_quda.cu b/lib/clover_trace_quda.cu
index cb0fc82131..5292957125 100644
--- a/lib/clover_trace_quda.cu
+++ b/lib/clover_trace_quda.cu
@@ -10,39 +10,49 @@ namespace quda {
   class CloverSigmaTrace : TunableKernel1D {
     GaugeField &output;
     const CloverField &clover;
+    const bool twisted;
     Float coeff;
-    unsigned int minThreads() const { return clover.VolumeCB(); }
+    const int parity;
+    unsigned int minThreads() const override { return clover.VolumeCB(); }
 
   public:
-    CloverSigmaTrace(GaugeField& output, const CloverField& clover, double coeff) :
+    CloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff, int parity) :
       TunableKernel1D(output),
       output(output),
       clover(clover),
-      coeff(static_cast<Float>(coeff))
+      twisted(clover.TwistFlavor() == QUDA_TWIST_SINGLET || clover.TwistFlavor() == QUDA_TWIST_NONDEG_DOUBLET),
+      coeff(static_cast<Float>(coeff)),
+      parity(parity)
     {
+      if (twisted) strcat(aux, ",twisted");
       apply(device::get_default_stream());
     }
 
-    void apply(const qudaStream_t &stream){
+    void apply(const qudaStream_t &stream) override
+    {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      launch<CloverSigmaTr>(tp, stream, CloverTraceArg<Float, nColor>(output, clover, coeff));
+      if (twisted) {
+        launch<CloverSigmaTr>(tp, stream, CloverTraceArg<Float, nColor, true>(output, clover, coeff, parity));
+      } else {
+        launch<CloverSigmaTr>(tp, stream, CloverTraceArg<Float, nColor, false>(output, clover, coeff, parity));
+      }
     }
 
-    long long flops() const { return 0; } // Fix this
-    long long bytes() const { return clover.Bytes() + output.Bytes(); }
+    void preTune() override { output.backup(); }
+    void postTune() override { output.restore(); }
+
+    long long flops() const override { return 0; } // Fix this
+    long long bytes() const override { return clover.Bytes() + output.Bytes(); }
   };
 
-#ifdef GPU_CLOVER_DIRAC
-  void computeCloverSigmaTrace(GaugeField& output, const CloverField& clover, double coeff)
+  void computeCloverSigmaTrace(GaugeField &output, const CloverField &clover, double coeff, int parity)
   {
-    checkNative(output, clover);
-    instantiate<CloverSigmaTrace>(output, clover, coeff);
-  }
-#else
-  void computeCloverSigmaTrace(GaugeField&, const CloverField&, double)
-  {
-    errorQuda("Clover has not been built");
+    if constexpr (is_enabled_clover()) { // true only when GPU_CLOVER_DIRAC is defined at build time
+      checkNative(output, clover);
+      instantiate<CloverSigmaTrace>(output, clover, coeff, parity); // parity is forwarded to CloverSigmaTrace
+    } else { // clover support compiled out: calling this is reported as an error
+      errorQuda("Clover has not been built");
+    }
   }
-#endif
 
 } // namespace quda
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 46c190f433..d2a6c93d8d 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -972,7 +972,7 @@ namespace quda
     }
   }
 
-  void ColorSpinorField::createComms(int nFace, bool spin_project)
+  void ColorSpinorField::createComms(int nFace, bool spin_project) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     allocateGhostBuffer(nFace, spin_project); // allocate the ghost buffer if not yet allocated
@@ -1008,7 +1008,7 @@ namespace quda
   // pack the ghost zone into a contiguous buffer for communications
   void ColorSpinorField::packGhost(const int nFace, const QudaParity parity, const int dagger, const qudaStream_t &stream,
                                    MemoryLocation location[2 * QUDA_MAX_DIM], MemoryLocation location_label,
-                                   bool spin_project, double a, double b, double c, int shmem)
+                                   bool spin_project, double a, double b, double c, int shmem) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     void *packBuffer[4 * QUDA_MAX_DIM] = {};
@@ -1049,7 +1049,7 @@ namespace quda
 
   void ColorSpinorField::pack(int nFace, int parity, int dagger, const qudaStream_t &stream,
                               MemoryLocation location[2 * QUDA_MAX_DIM], MemoryLocation location_label,
-                              bool spin_project, double a, double b, double c, int shmem)
+                              bool spin_project, double a, double b, double c, int shmem) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     createComms(nFace, spin_project); // must call this first
@@ -1057,7 +1057,8 @@ namespace quda
     packGhost(nFace, (QudaParity)parity, dagger, stream, location, location_label, spin_project, a, b, c, shmem);
   }
 
-  void ColorSpinorField::sendGhost(void *ghost_spinor, const int dim, const QudaDirection dir, const qudaStream_t &stream)
+  void ColorSpinorField::sendGhost(void *ghost_spinor, const int dim, const QudaDirection dir,
+                                   const qudaStream_t &stream) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     void *gpu_buf
@@ -1066,7 +1067,7 @@ namespace quda
   }
 
   void ColorSpinorField::unpackGhost(const void *ghost_spinor, const int dim, const QudaDirection dir,
-                                     const qudaStream_t &stream)
+                                     const qudaStream_t &stream) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     const void *src = ghost_spinor;
@@ -1076,7 +1077,7 @@ namespace quda
     qudaMemcpyAsync(ghost_dst, src, ghost_face_bytes[dim], qudaMemcpyHostToDevice, stream);
   }
 
-  void ColorSpinorField::gather(int dir, const qudaStream_t &stream)
+  void ColorSpinorField::gather(int dir, const qudaStream_t &stream) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     int dim = dir / 2;
@@ -1094,7 +1095,7 @@ namespace quda
     }
   }
 
-  void ColorSpinorField::recvStart(int d, const qudaStream_t &, bool gdr)
+  void ColorSpinorField::recvStart(int d, const qudaStream_t &, bool gdr) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     // note this is scatter centric, so dir=0 (1) is send backwards
@@ -1114,7 +1115,7 @@ namespace quda
     }
   }
 
-  void ColorSpinorField::sendStart(int d, const qudaStream_t &stream, bool gdr, bool remote_write)
+  void ColorSpinorField::sendStart(int d, const qudaStream_t &stream, bool gdr, bool remote_write) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     // note this is scatter centric, so dir=0 (1) is send backwards
@@ -1148,7 +1149,7 @@ namespace quda
     }
   }
 
-  void ColorSpinorField::commsStart(int dir, const qudaStream_t &stream, bool gdr_send, bool gdr_recv)
+  void ColorSpinorField::commsStart(int dir, const qudaStream_t &stream, bool gdr_send, bool gdr_recv) const
   {
     recvStart(dir, stream, gdr_recv);
     sendStart(dir, stream, gdr_send);
@@ -1157,7 +1158,7 @@ namespace quda
   static bool complete_recv[QUDA_MAX_DIM][2] = {};
   static bool complete_send[QUDA_MAX_DIM][2] = {};
 
-  int ColorSpinorField::commsQuery(int d, const qudaStream_t &, bool gdr_send, bool gdr_recv)
+  int ColorSpinorField::commsQuery(int d, const qudaStream_t &, bool gdr_send, bool gdr_recv) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     // note this is scatter centric, so dir=0 (1) is send backwards
@@ -1198,7 +1199,7 @@ namespace quda
     }
   }
 
-  void ColorSpinorField::commsWait(int d, const qudaStream_t &, bool gdr_send, bool gdr_recv)
+  void ColorSpinorField::commsWait(int d, const qudaStream_t &, bool gdr_send, bool gdr_recv) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     // note this is scatter centric, so dir=0 (1) is send backwards
@@ -1231,7 +1232,7 @@ namespace quda
     }
   }
 
-  void ColorSpinorField::scatter(int dim_dir, const qudaStream_t &stream)
+  void ColorSpinorField::scatter(int dim_dir, const qudaStream_t &stream) const
   {
     if (Location() == QUDA_CPU_FIELD_LOCATION) errorQuda("Host field not supported");
     // note this is scatter centric, so input expects dir=0 (1) is send backwards
@@ -1288,7 +1289,7 @@ namespace quda
       }
 
       if ((gdr_send || gdr_recv) && !comm_gdr_enabled()) errorQuda("Requesting GDR comms but GDR is not enabled");
-      const_cast<ColorSpinorField &>(*this).createComms(nFace, false);
+      createComms(nFace, false);
 
       if (pack_destination[0] != Shmem) {
 
@@ -1370,8 +1371,7 @@ namespace quda
         }
 
         // prepost receive
-        for (int i = 0; i < 2 * nDimComms; i++)
-          const_cast<ColorSpinorField *>(this)->recvStart(i, device::get_default_stream(), gdr_recv);
+        for (int i = 0; i < 2 * nDimComms; i++) recvStart(i, device::get_default_stream(), gdr_recv);
 
         // FIXME use events to properly synchronize streams, logic below failed when using p2p in all 4 dimensions (DGX2)
         bool sync = true;
@@ -1387,8 +1387,7 @@ namespace quda
           for (int dim = 0; dim < nDimComms; dim++) {
             for (int dir = 0; dir < 2; dir++) {
               if ((comm_peer2peer_enabled(dir, dim) + p2p) % 2 == 0) { // issue non-p2p transfers first
-                const_cast<ColorSpinorField *>(this)->sendStart(2 * dim + dir, device::get_stream(2 * dim + dir),
-                                                                gdr_send);
+                sendStart(2 * dim + dir, device::get_stream(2 * dim + dir), gdr_send);
               }
             }
           }
@@ -1400,8 +1399,8 @@ namespace quda
           for (int dim = 0; dim < nDimComms; dim++) {
             for (int dir = 0; dir < 2; dir++) {
               if (!comms_complete[dim * 2 + dir]) {
-                comms_complete[2 * dim + dir] = const_cast<ColorSpinorField *>(this)->commsQuery(
-                  2 * dim + dir, device::get_default_stream(), gdr_send, gdr_recv);
+                comms_complete[2 * dim + dir]
+                  = commsQuery(2 * dim + dir, device::get_default_stream(), gdr_send, gdr_recv);
                 if (comms_complete[2 * dim + dir]) {
                   comms_done++;
                   if (comm_peer2peer_enabled(1 - dir, dim))
diff --git a/lib/dslash5_domain_wall.cu b/lib/dslash5_domain_wall.cu
index 4ecf7c8b02..b985893325 100644
--- a/lib/dslash5_domain_wall.cu
+++ b/lib/dslash5_domain_wall.cu
@@ -71,8 +71,10 @@ namespace quda
       // FIXME: actually, the shared object is still constructed even if not used
       if (mobius_m5::shared()) {
         // spin components in shared depend on inversion algorithm
-	bool isInv = type == Dslash5Type::M5_INV_DWF || type == Dslash5Type::M5_INV_MOBIUS || type == Dslash5Type::M5_INV_ZMOBIUS;
-        int nSpin = (!isInv || mobius_m5::var_inverse()) ? mobius_m5::use_half_vector() ? in.Nspin() / 2 : in.Nspin() : in.Nspin();
+        bool isInv = type == Dslash5Type::M5_INV_DWF || type == Dslash5Type::M5_INV_MOBIUS
+          || type == Dslash5Type::M5_INV_ZMOBIUS;
+        int nSpin = (!isInv || mobius_m5::var_inverse()) ? mobius_m5::use_half_vector() ? in.Nspin() / 2 : in.Nspin() :
+                                                           in.Nspin();
         return 2 * nSpin * nColor * sizeof(typename mapper<Float>::type);
       } else {
         return 0;
diff --git a/lib/dslash_clover_helper.cu b/lib/dslash_clover_helper.cu
index 3d6db85853..d2acdf369f 100644
--- a/lib/dslash_clover_helper.cu
+++ b/lib/dslash_clover_helper.cu
@@ -77,9 +77,9 @@ namespace quda {
     unsigned int sharedBytesPerThread() const
     {
       if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
-	return 0;
+        return 0; // singlet twist: zero shared bytes per thread
       } else {
-	return (in.Nspin() / 2) * in.Ncolor() * 2 * sizeof(typename mapper<Float>::type);
+        return (in.Nspin() / 2) * in.Ncolor() * 2 * sizeof(typename mapper<Float>::type);
       }
     }
 
diff --git a/lib/dslash_policy.cuh b/lib/dslash_policy.cuh
index 483b91d65f..ecb5f91eb7 100644
--- a/lib/dslash_policy.cuh
+++ b/lib/dslash_policy.cuh
@@ -120,15 +120,15 @@ namespace quda
      @param[in] stream Stream were the receive is being posted (effectively ignored)
      @param[in] gdr Whether we are using GPU Direct RDMA or not
   */
-  template <typename Dslash>
-  inline void issueRecv(ColorSpinorField &input, const Dslash &dslash, bool gdr)
-  {
-    for(int i=3; i>=0; i--){
-      if (!dslash.dslashParam.commDim[i]) continue;
-      for(int dir=1; dir>=0; dir--) {
-        PROFILE(if (dslash_comms) input.recvStart(2*i+dir, device::get_stream(2*i+dir), gdr), profile, QUDA_PROFILE_COMMS_START);
+    template <typename Dslash> inline void issueRecv(const ColorSpinorField &input, const Dslash &dslash, bool gdr)
+    {
+      for (int i = 3; i >= 0; i--) { // dimensions are visited from 3 down to 0
+        if (!dslash.dslashParam.commDim[i]) continue; // skip dimensions not flagged in commDim
+        for (int dir = 1; dir >= 0; dir--) {
+          PROFILE(if (dslash_comms) input.recvStart(2 * i + dir, device::get_stream(2 * i + dir), gdr), profile,
+                  QUDA_PROFILE_COMMS_START);
+        }
       }
-    }
   }
 
   /**
@@ -142,7 +142,7 @@ namespace quda
      @param[in] packIndex Stream index where the packing kernel will run
   */
   template <typename Dslash>
-  inline void issuePack(ColorSpinorField &in, const Dslash &dslash, int parity, MemoryLocation location,
+  inline void issuePack(const ColorSpinorField &in, const Dslash &dslash, int parity, MemoryLocation location,
                         int packIndex, int shmem = 0)
   {
     auto &arg = dslash.dslashParam;
@@ -185,7 +185,7 @@ namespace quda
      @param[out] in Field that whose halos we are communicating
      @param[in] dslash The dslash object
   */
-  template <typename Dslash> inline void issueGather(ColorSpinorField &in, const Dslash &dslash)
+  template <typename Dslash> inline void issueGather(const ColorSpinorField &in, const Dslash &dslash)
   {
 
     for (int i = 3; i >=0; i--) {
@@ -255,8 +255,8 @@ namespace quda
      @param[in] scatterIndex The stream index used for posting the host-to-device memory copy in
    */
   template <typename Dslash>
-  inline bool commsComplete(ColorSpinorField &in, const Dslash &, int dim, int dir, bool gdr_send,
-                            bool gdr_recv, bool zero_copy_recv, int scatterIndex = -1)
+  inline bool commsComplete(const ColorSpinorField &in, const Dslash &, int dim, int dir, bool gdr_send, bool gdr_recv,
+                            bool zero_copy_recv, int scatterIndex = -1)
   {
     PROFILE(int comms_test = dslash_comms ? in.commsQuery(2*dim+dir, device::get_stream(2*dim+dir), gdr_send, gdr_recv) : 1, profile, QUDA_PROFILE_COMMS_QUERY);
     if (comms_test) {
@@ -317,7 +317,7 @@ namespace quda
      @param[in,out] in The ColorSpinorField source
      @param[in] to_mapped Whether we are switching to mapped ghosts or not
    */
-  template <typename Dslash> inline void setMappedGhost(Dslash &dslash, ColorSpinorField &in, bool to_mapped)
+  template <typename Dslash> inline void setMappedGhost(Dslash &dslash, const ColorSpinorField &in, bool to_mapped)
   {
     static char aux_copy[TuneKey::aux_n];
     static bool set_mapped = false;
@@ -343,7 +343,7 @@ namespace quda
 
   template <typename Dslash> struct DslashPolicyImp {
 
-    virtual void operator()(Dslash &, ColorSpinorField *, const int, const int *, TimeProfile &) { }
+    virtual void operator()(Dslash &, const ColorSpinorField &, const int, const int *, TimeProfile &) { }
 
     virtual ~DslashPolicyImp() { }
   };
@@ -353,8 +353,8 @@ namespace quda
   */
   template <typename Dslash> struct DslashBasic : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
       profile.TPSTART(QUDA_PROFILE_TOTAL);
       auto &dslashParam = dslash.dslashParam;
@@ -362,14 +362,14 @@ namespace quda
       dslashParam.threads = volume;
       dslash.setShmem(0);
 
-      issueRecv(*in, dslash, false); // Prepost receives
+      issueRecv(in, dslash, false); // Prepost receives
 
       const int packIndex = device::get_default_stream_idx();
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
                 packIndex);
 
-      issueGather(*in, dslash);
+      issueGather(in, dslash);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       if (aux_worker) aux_worker->apply(device::get_default_stream());
@@ -389,16 +389,16 @@ namespace quda
               if (event_test) {
                 pattern.gatherCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
-                PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                        device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                        false, dslashParam.remote_write),
-                    profile, QUDA_PROFILE_COMMS_START);
+                PROFILE(if (dslash_comms) in.sendStart(
+                          2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir), false,
+                          dslashParam.remote_write),
+                        profile, QUDA_PROFILE_COMMS_START);
               }
             }
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, false, false)) {
+              if (commsComplete(in, dslash, i, dir, false, false, false)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -431,8 +431,8 @@ namespace quda
         }
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -452,7 +452,7 @@ namespace quda
   template <typename Dslash, int shmem> struct DslashShmemGeneric : DslashPolicyImp<Dslash> {
 
 #ifdef NVSHMEM_COMMS
-    void operator()(Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB,
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
                     TimeProfile &profile)
     {
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -470,9 +470,7 @@ namespace quda
       const int packIndex = device::get_default_stream_idx();
       constexpr MemoryLocation location = static_cast<MemoryLocation>(Shmem);
 
-      if (!((shmem & 2) and (shmem & 1))) {
-        issuePack(*in, dslash, 1 - dslashParam.parity, location, packIndex, shmem);
-      }
+      if (!((shmem & 2) and (shmem & 1))) { issuePack(in, dslash, 1 - dslashParam.parity, location, packIndex, shmem); }
 
       dslash.setPack(((shmem & 2) or (shmem & 1)), location); // enable fused kernel packing
 
@@ -489,11 +487,11 @@ namespace quda
       }
 
       dslash::inc_dslash_shmem_sync_counter();
-      in->bufferIndex = (1 - in->bufferIndex);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
 #else
-    void operator()(Dslash &, ColorSpinorField *, const int, const int *, TimeProfile &)
+    void operator()(Dslash &, const ColorSpinorField &, const int, const int *, TimeProfile &)
     {
       errorQuda("NVSHMEM Dslash policies not built.");
     }
@@ -510,8 +508,8 @@ namespace quda
  */
   template <typename Dslash> struct DslashFusedExterior : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -521,14 +519,14 @@ namespace quda
       dslashParam.threads = volume;
       dslash.setShmem(0);
 
-      issueRecv(*in, dslash, false); // Prepost receives
+      issueRecv(in, dslash, false); // Prepost receives
 
       const int packIndex = device::get_default_stream_idx();
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
                 packIndex);
 
-      issueGather(*in, dslash);
+      issueGather(in, dslash);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       if (aux_worker) aux_worker->apply(device::get_default_stream());
@@ -549,16 +547,16 @@ namespace quda
               if (event_test) {
                 pattern.gatherCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
-                PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                        device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                        false, dslashParam.remote_write),
-                    profile, QUDA_PROFILE_COMMS_START);
+                PROFILE(if (dslash_comms) in.sendStart(
+                          2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir), false,
+                          dslashParam.remote_write),
+                        profile, QUDA_PROFILE_COMMS_START);
               }
             }
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, false, false, scatterIndex)) {
+              if (commsComplete(in, dslash, i, dir, false, false, false, scatterIndex)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -584,8 +582,8 @@ namespace quda
         PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -595,8 +593,8 @@ namespace quda
  */
   template <typename Dslash> struct DslashGDR : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -606,11 +604,11 @@ namespace quda
       dslashParam.threads = volume;
       dslash.setShmem(0);
 
-      issueRecv(*in, dslash, true); // Prepost receives
+      issueRecv(in, dslash, true); // Prepost receives
 
       const int packIndex = device::get_default_stream_idx();
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
                 packIndex);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
@@ -622,16 +620,16 @@ namespace quda
           if (!dslashParam.commDim[i]) continue;
 
           if (!pack_event) {
-            qudaEventSynchronize(packEnd[in->bufferIndex]);
+            qudaEventSynchronize(packEnd[in.bufferIndex]);
             pack_event = true;
           }
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                      true, dslashParam.remote_write),
-                  profile, QUDA_PROFILE_COMMS_START);
+              PROFILE(if (dslash_comms)
+                        in.sendStart(2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
+                                     true, dslashParam.remote_write),
+                      profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
         }     // i
@@ -646,8 +644,7 @@ namespace quda
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, true, true, false)) {
-                ;
+              if (commsComplete(in, dslash, i, dir, true, true, false)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -668,8 +665,8 @@ namespace quda
         }
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -679,8 +676,8 @@ namespace quda
  */
   template <typename Dslash> struct DslashFusedGDR : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -690,11 +687,11 @@ namespace quda
       dslashParam.threads = volume;
       dslash.setShmem(0);
 
-      issueRecv(*in, dslash, true); // Prepost receives
+      issueRecv(in, dslash, true); // Prepost receives
 
       const int packIndex = device::get_default_stream_idx();
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
                 packIndex);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
@@ -706,16 +703,16 @@ namespace quda
           if (!dslashParam.commDim[i]) continue;
 
           if (!pack_event) {
-            qudaEventSynchronize(packEnd[in->bufferIndex]);
+            qudaEventSynchronize(packEnd[in.bufferIndex]);
             pack_event = true;
           }
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                      true, dslashParam.remote_write),
-                  profile, QUDA_PROFILE_COMMS_START);
+              PROFILE(if (dslash_comms)
+                        in.sendStart(2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
+                                     true, dslashParam.remote_write),
+                      profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }
         }
@@ -730,7 +727,7 @@ namespace quda
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, true, true, false)) {
+              if (commsComplete(in, dslash, i, dir, true, true, false)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -745,8 +742,8 @@ namespace quda
         PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -756,8 +753,8 @@ namespace quda
  */
   template <typename Dslash> struct DslashGDRRecv : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -767,14 +764,14 @@ namespace quda
       dslashParam.threads = volume;
       dslash.setShmem(0);
 
-      issueRecv(*in, dslash, true); // Prepost receives
+      issueRecv(in, dslash, true); // Prepost receives
 
       const int packIndex = device::get_default_stream_idx();
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
                 packIndex);
 
-      issueGather(*in, dslash);
+      issueGather(in, dslash);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       if (aux_worker) aux_worker->apply(device::get_default_stream());
@@ -794,16 +791,16 @@ namespace quda
               if (event_test) {
                 pattern.gatherCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
-                PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                        device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                        false, dslashParam.remote_write),
-                    profile, QUDA_PROFILE_COMMS_START);
+                PROFILE(if (dslash_comms) in.sendStart(
+                          2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir), false,
+                          dslashParam.remote_write),
+                        profile, QUDA_PROFILE_COMMS_START);
               }
             }
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, true, false)) {
+              if (commsComplete(in, dslash, i, dir, false, true, false)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -824,8 +821,8 @@ namespace quda
         }
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -835,8 +832,8 @@ namespace quda
  */
   template <typename Dslash> struct DslashFusedGDRRecv : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -846,14 +843,14 @@ namespace quda
       dslashParam.threads = volume;
       dslash.setShmem(0);
 
-      issueRecv(*in, dslash, true); // Prepost receives
+      issueRecv(in, dslash, true); // Prepost receives
 
       const int packIndex = device::get_default_stream_idx();
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Device | (Remote * dslashParam.remote_write)),
                 packIndex);
 
-      issueGather(*in, dslash);
+      issueGather(in, dslash);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       if (aux_worker) aux_worker->apply(device::get_default_stream());
@@ -873,16 +870,17 @@ namespace quda
               if (event_test) {
                 pattern.gatherCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
-                PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                        dslashParam.remote_write ? device::get_default_stream() : device::get_stream(2 * i + dir),
-                                                        false, dslashParam.remote_write),
-                    profile, QUDA_PROFILE_COMMS_START);
+                PROFILE(if (dslash_comms) in.sendStart(2 * i + dir,
+                                                       dslashParam.remote_write ? device::get_default_stream() :
+                                                                                  device::get_stream(2 * i + dir),
+                                                       false, dslashParam.remote_write),
+                        profile, QUDA_PROFILE_COMMS_START);
               }
             }
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, true, false)) {
+              if (commsComplete(in, dslash, i, dir, false, true, false)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -897,8 +895,8 @@ namespace quda
         PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -909,8 +907,8 @@ namespace quda
   */
   template <typename Dslash> struct DslashZeroCopyPack : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -921,15 +919,16 @@ namespace quda
       dslash.setShmem(0);
 
       // record start of the dslash
-      PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], device::get_default_stream()), profile, QUDA_PROFILE_EVENT_RECORD);
+      PROFILE(qudaEventRecord(dslashStart[in.bufferIndex], device::get_default_stream()), profile,
+              QUDA_PROFILE_EVENT_RECORD);
 
-      issueRecv(*in, dslash, false); // Prepost receives
+      issueRecv(in, dslash, false); // Prepost receives
 
       const int packIndex = getStreamIndex(dslashParam);
-      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in->bufferIndex], 0), profile,
-          QUDA_PROFILE_STREAM_WAIT_EVENT);
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
+      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in.bufferIndex], 0), profile,
+              QUDA_PROFILE_STREAM_WAIT_EVENT);
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
                 packIndex);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
@@ -949,10 +948,10 @@ namespace quda
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                      false, dslashParam.remote_write),
-                  profile, QUDA_PROFILE_COMMS_START);
+              PROFILE(if (dslash_comms)
+                        in.sendStart(2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
+                                     false, dslashParam.remote_write),
+                      profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
         }     // i
@@ -968,7 +967,7 @@ namespace quda
 
             // Query if comms have finished
             if (!pattern.commsCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, false, false)) {
+              if (commsComplete(in, dslash, i, dir, false, false, false)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -1000,8 +999,8 @@ namespace quda
         }
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -1012,8 +1011,8 @@ namespace quda
 */
   template <typename Dslash> struct DslashFusedZeroCopyPack : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -1024,16 +1023,17 @@ namespace quda
       dslash.setShmem(0);
 
       // record start of the dslash
-      PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], device::get_default_stream()), profile, QUDA_PROFILE_EVENT_RECORD);
+      PROFILE(qudaEventRecord(dslashStart[in.bufferIndex], device::get_default_stream()), profile,
+              QUDA_PROFILE_EVENT_RECORD);
 
       const int packScatterIndex = getStreamIndex(dslashParam);
-      PROFILE(qudaStreamWaitEvent(device::get_stream(packScatterIndex), dslashStart[in->bufferIndex], 0), profile,
-          QUDA_PROFILE_STREAM_WAIT_EVENT);
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
+      PROFILE(qudaStreamWaitEvent(device::get_stream(packScatterIndex), dslashStart[in.bufferIndex], 0), profile,
+              QUDA_PROFILE_STREAM_WAIT_EVENT);
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
                 packScatterIndex);
 
-      issueRecv(*in, dslash, false); // Prepost receives
+      issueRecv(in, dslash, false); // Prepost receives
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       if (aux_worker) aux_worker->apply(device::get_default_stream());
@@ -1052,11 +1052,10 @@ namespace quda
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(
-                  if (dslash_comms) in->sendStart(2 * i + dir,
-                                                  device::get_stream(dslashParam.remote_write ? packScatterIndex : 2 * i + dir),
-                                                  false, dslashParam.remote_write),
-                  profile, QUDA_PROFILE_COMMS_START);
+              PROFILE(if (dslash_comms) in.sendStart(
+                        2 * i + dir, device::get_stream(dslashParam.remote_write ? packScatterIndex : 2 * i + dir),
+                        false, dslashParam.remote_write),
+                      profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
         }     // i
@@ -1072,7 +1071,7 @@ namespace quda
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, false, false, packScatterIndex)) {
+              if (commsComplete(in, dslash, i, dir, false, false, false, packScatterIndex)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -1097,8 +1096,8 @@ namespace quda
         PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -1108,8 +1107,8 @@ namespace quda
  */
   template <typename Dslash> struct DslashZeroCopyPackGDRRecv : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -1120,15 +1119,16 @@ namespace quda
       dslash.setShmem(0);
 
       // record start of the dslash
-      PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], device::get_default_stream()), profile, QUDA_PROFILE_EVENT_RECORD);
+      PROFILE(qudaEventRecord(dslashStart[in.bufferIndex], device::get_default_stream()), profile,
+              QUDA_PROFILE_EVENT_RECORD);
 
-      issueRecv(*in, dslash, true); // Prepost receives
+      issueRecv(in, dslash, true); // Prepost receives
 
       const int packIndex = getStreamIndex(dslashParam);
-      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in->bufferIndex], 0), profile,
-          QUDA_PROFILE_STREAM_WAIT_EVENT);
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
+      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in.bufferIndex], 0), profile,
+              QUDA_PROFILE_STREAM_WAIT_EVENT);
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
                 packIndex);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
@@ -1148,10 +1148,10 @@ namespace quda
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                      false, dslashParam.remote_write),
-                  profile, QUDA_PROFILE_COMMS_START);
+              PROFILE(if (dslash_comms)
+                        in.sendStart(2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
+                                     false, dslashParam.remote_write),
+                      profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
         }     // i
@@ -1167,7 +1167,7 @@ namespace quda
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, true, false)) {
+              if (commsComplete(in, dslash, i, dir, false, true, false)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -1188,8 +1188,8 @@ namespace quda
         }
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -1200,8 +1200,8 @@ namespace quda
  */
   template <typename Dslash> struct DslashFusedZeroCopyPackGDRRecv : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -1212,16 +1212,17 @@ namespace quda
       dslash.setShmem(0);
 
       // record start of the dslash
-      PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], device::get_default_stream()), profile, QUDA_PROFILE_EVENT_RECORD);
+      PROFILE(qudaEventRecord(dslashStart[in.bufferIndex], device::get_default_stream()), profile,
+              QUDA_PROFILE_EVENT_RECORD);
 
       const int packIndex = getStreamIndex(dslashParam);
-      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in->bufferIndex], 0), profile,
-          QUDA_PROFILE_STREAM_WAIT_EVENT);
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
+      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in.bufferIndex], 0), profile,
+              QUDA_PROFILE_STREAM_WAIT_EVENT);
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
                 packIndex);
 
-      issueRecv(*in, dslash, true); // Prepost receives
+      issueRecv(in, dslash, true); // Prepost receives
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       if (aux_worker) aux_worker->apply(device::get_default_stream());
@@ -1240,10 +1241,10 @@ namespace quda
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                      false, dslashParam.remote_write),
-                  profile, QUDA_PROFILE_COMMS_START);
+              PROFILE(if (dslash_comms)
+                        in.sendStart(2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
+                                     false, dslashParam.remote_write),
+                      profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
         }     // i
@@ -1259,7 +1260,7 @@ namespace quda
 
             // Query if comms has finished
             if (!pattern.commsCompleted[2 * i + dir] && pattern.gatherCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, true, false)) {
+              if (commsComplete(in, dslash, i, dir, false, true, false)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -1274,8 +1275,8 @@ namespace quda
         PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -1286,8 +1287,8 @@ namespace quda
 */
   template <typename Dslash> struct DslashZeroCopy : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -1298,15 +1299,16 @@ namespace quda
       dslash.setShmem(0);
 
       // record start of the dslash
-      PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], device::get_default_stream()), profile, QUDA_PROFILE_EVENT_RECORD);
+      PROFILE(qudaEventRecord(dslashStart[in.bufferIndex], device::get_default_stream()), profile,
+              QUDA_PROFILE_EVENT_RECORD);
 
-      issueRecv(*in, dslash, false); // Prepost receives
+      issueRecv(in, dslash, false); // Prepost receives
 
       const int packIndex = getStreamIndex(dslashParam);
-      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in->bufferIndex], 0), profile,
-          QUDA_PROFILE_STREAM_WAIT_EVENT);
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
+      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in.bufferIndex], 0), profile,
+              QUDA_PROFILE_STREAM_WAIT_EVENT);
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
                 packIndex);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
@@ -1326,10 +1328,10 @@ namespace quda
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                      false, dslashParam.remote_write),
-                  profile, QUDA_PROFILE_COMMS_START);
+              PROFILE(if (dslash_comms)
+                        in.sendStart(2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
+                                     false, dslashParam.remote_write),
+                      profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
         }     // i
@@ -1345,7 +1347,7 @@ namespace quda
 
             // Query if comms have finished
             if (!pattern.commsCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, false, true)) {
+              if (commsComplete(in, dslash, i, dir, false, false, true)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -1358,16 +1360,16 @@ namespace quda
             dslashParam.kernel_type = static_cast<KernelType>(i);
             dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
 
-            setMappedGhost(dslash, *in, true);
+            setMappedGhost(dslash, in, true);
             PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
-            setMappedGhost(dslash, *in, false);
+            setMappedGhost(dslash, in, false);
 
             pattern.dslashCompleted[2 * i] = 1;
           }
         }
       }
 
-      in->bufferIndex = (1 - in->bufferIndex);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -1378,8 +1380,8 @@ namespace quda
 */
   template <typename Dslash> struct DslashFusedZeroCopy : DslashPolicyImp<Dslash> {
 
-    void operator()(
-        Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB, TimeProfile &profile)
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
+                    TimeProfile &profile)
     {
 
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -1390,15 +1392,16 @@ namespace quda
       dslash.setShmem(0);
 
       // record start of the dslash
-      PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], device::get_default_stream()), profile, QUDA_PROFILE_EVENT_RECORD);
+      PROFILE(qudaEventRecord(dslashStart[in.bufferIndex], device::get_default_stream()), profile,
+              QUDA_PROFILE_EVENT_RECORD);
 
-      issueRecv(*in, dslash, false); // Prepost receives
+      issueRecv(in, dslash, false); // Prepost receives
 
       const int packIndex = getStreamIndex(dslashParam);
-      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in->bufferIndex], 0), profile,
-          QUDA_PROFILE_STREAM_WAIT_EVENT);
-      const int parity_src = (in->SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
-      issuePack(*in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
+      PROFILE(qudaStreamWaitEvent(device::get_stream(packIndex), dslashStart[in.bufferIndex], 0), profile,
+              QUDA_PROFILE_STREAM_WAIT_EVENT);
+      const int parity_src = (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET ? 1 - dslashParam.parity : 0);
+      issuePack(in, dslash, parity_src, static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write)),
                 packIndex);
 
       PROFILE(if (dslash_interior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
@@ -1418,10 +1421,10 @@ namespace quda
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
-                                                      false, dslashParam.remote_write),
-                  profile, QUDA_PROFILE_COMMS_START);
+              PROFILE(if (dslash_comms)
+                        in.sendStart(2 * i + dir, device::get_stream(dslashParam.remote_write ? packIndex : 2 * i + dir),
+                                     false, dslashParam.remote_write),
+                      profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
         }     // i
@@ -1437,7 +1440,7 @@ namespace quda
 
             // Query if comms have finished
             if (!pattern.commsCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, false, true)) {
+              if (commsComplete(in, dslash, i, dir, false, false, true)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -1448,13 +1451,13 @@ namespace quda
 
       if (pattern.commDimTotal) {
         setFusedParam(dslashParam, dslash, faceVolumeCB); // setup for exterior kernel
-        setMappedGhost(dslash, *in, true);
+        setMappedGhost(dslash, in, true);
         PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
-        setMappedGhost(dslash, *in, false);
+        setMappedGhost(dslash, in, false);
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -1466,7 +1469,7 @@ namespace quda
   */
   template <typename Dslash> struct DslashFusedPack : DslashPolicyImp<Dslash> {
 
-    void operator()(Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB,
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
                     TimeProfile &profile)
     {
 
@@ -1478,9 +1481,10 @@ namespace quda
       dslash.setShmem(0);
 
       // record start of the dslash
-      PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], device::get_default_stream()), profile, QUDA_PROFILE_EVENT_RECORD);
+      PROFILE(qudaEventRecord(dslashStart[in.bufferIndex], device::get_default_stream()), profile,
+              QUDA_PROFILE_EVENT_RECORD);
 
-      issueRecv(*in, dslash, false); // Prepost receives
+      issueRecv(in, dslash, false); // Prepost receives
 
       MemoryLocation location = static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write));
       dslash.setPack(true, location); // enable fused kernel packing
@@ -1504,9 +1508,10 @@ namespace quda
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      dslashParam.remote_write ? device::get_default_stream() : device::get_stream(2 * i + dir),
-                                                      false, dslashParam.remote_write),
+              PROFILE(if (dslash_comms) in.sendStart(2 * i + dir,
+                                                     dslashParam.remote_write ? device::get_default_stream() :
+                                                                                device::get_stream(2 * i + dir),
+                                                     false, dslashParam.remote_write),
                       profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
@@ -1523,7 +1528,7 @@ namespace quda
 
             // Query if comms have finished
             if (!pattern.commsCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, false, true)) {
+              if (commsComplete(in, dslash, i, dir, false, false, true)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -1537,16 +1542,16 @@ namespace quda
             dslashParam.kernel_type = static_cast<KernelType>(i);
             dslashParam.threads = dslash.Nface() * faceVolumeCB[i]; // updating 2 or 6 faces
 
-            setMappedGhost(dslash, *in, true);
+            setMappedGhost(dslash, in, true);
             PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
-            setMappedGhost(dslash, *in, false);
+            setMappedGhost(dslash, in, false);
 
             pattern.dslashCompleted[2 * i] = 1;
           }
         }
       }
 
-      in->bufferIndex = (1 - in->bufferIndex);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -1559,7 +1564,7 @@ namespace quda
   */
   template <typename Dslash> struct DslashFusedPackFusedHalo : DslashPolicyImp<Dslash> {
 
-    void operator()(Dslash &dslash, ColorSpinorField *in, const int volume, const int *faceVolumeCB,
+    void operator()(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *faceVolumeCB,
                     TimeProfile &profile)
     {
       profile.TPSTART(QUDA_PROFILE_TOTAL);
@@ -1570,9 +1575,10 @@ namespace quda
       dslash.setShmem(0);
 
       // record start of the dslash
-      PROFILE(qudaEventRecord(dslashStart[in->bufferIndex], device::get_default_stream()), profile, QUDA_PROFILE_EVENT_RECORD);
+      PROFILE(qudaEventRecord(dslashStart[in.bufferIndex], device::get_default_stream()), profile,
+              QUDA_PROFILE_EVENT_RECORD);
 
-      issueRecv(*in, dslash, false); // Prepost receives
+      issueRecv(in, dslash, false); // Prepost receives
 
       MemoryLocation location = static_cast<MemoryLocation>(Host | (Remote * dslashParam.remote_write));
       dslash.setPack(true, location); // enable fused kernel packing
@@ -1596,9 +1602,10 @@ namespace quda
 
           for (int dir = 1; dir >= 0; dir--) {
             if ((comm_peer2peer_enabled(dir, i) + p2p) % 2 == 0) {
-              PROFILE(if (dslash_comms) in->sendStart(2 * i + dir,
-                                                      dslashParam.remote_write ? device::get_default_stream() : device::get_stream(2 * i + dir),
-                                                      false, dslashParam.remote_write),
+              PROFILE(if (dslash_comms) in.sendStart(2 * i + dir,
+                                                     dslashParam.remote_write ? device::get_default_stream() :
+                                                                                device::get_stream(2 * i + dir),
+                                                     false, dslashParam.remote_write),
                       profile, QUDA_PROFILE_COMMS_START);
             } // is p2p?
           }   // dir
@@ -1615,7 +1622,7 @@ namespace quda
 
             // Query if comms have finished
             if (!pattern.commsCompleted[2 * i + dir]) {
-              if (commsComplete(*in, dslash, i, dir, false, false, true)) {
+              if (commsComplete(in, dslash, i, dir, false, false, true)) {
                 pattern.commsCompleted[2 * i + dir] = 1;
                 pattern.completeSum++;
               }
@@ -1627,13 +1634,13 @@ namespace quda
       if (pattern.commDimTotal) {
         setFusedParam(dslashParam, dslash,
                       faceVolumeCB); // setup for exterior kernel
-        setMappedGhost(dslash, *in, true);
+        setMappedGhost(dslash, in, true);
         PROFILE(if (dslash_exterior_compute) dslash.apply(device::get_default_stream()), profile, QUDA_PROFILE_DSLASH_KERNEL);
-        setMappedGhost(dslash, *in, false);
+        setMappedGhost(dslash, in, false);
       }
 
-      completeDslash(*in, dslashParam);
-      in->bufferIndex = (1 - in->bufferIndex);
+      completeDslash(in, dslashParam);
+      in.bufferIndex = (1 - in.bufferIndex);
       profile.TPSTOP(QUDA_PROFILE_TOTAL);
     }
   };
@@ -1751,7 +1758,7 @@ namespace quda
     Dslash &dslash;
     using Arg = std::remove_reference_t<decltype(dslash.dslashParam)>;
     Arg &dslashParam;
-    ColorSpinorField &in;
+    const ColorSpinorField &in;
     const int volume;
     const int *ghostFace;
     TimeProfile &profile;
@@ -1762,14 +1769,9 @@ namespace quda
     unsigned int sharedBytesPerBlock(const TuneParam &) const override { return 0; }
 
   public:
-    DslashPolicyTune(
-        Dslash &dslash, const ColorSpinorField &in, const int volume, const int *ghostFace, TimeProfile &profile) :
-        dslash(dslash),
-        dslashParam(dslash.dslashParam),
-        in(const_cast<ColorSpinorField &>(in)),
-        volume(volume),
-        ghostFace(ghostFace),
-        profile(profile)
+    DslashPolicyTune(Dslash &dslash, const ColorSpinorField &in, const int volume, const int *ghostFace,
+                     TimeProfile &profile) :
+      dslash(dslash), dslashParam(dslash.dslashParam), in(in), volume(volume), ghostFace(ghostFace), profile(profile)
     {
       if (!dslash_policy_init) {
 
@@ -1927,7 +1929,7 @@ namespace quda
                 i == QudaDslashPolicy::QUDA_SHMEM_PACKFULL_DSLASH) {
 
               auto dslashImp = DslashFactory<Dslash>::create(i);
-              (*dslashImp)(dslash, &(this->in), volume, ghostFace, profile);
+              (*dslashImp)(dslash, this->in, volume, ghostFace, profile);
 
           } else if (i == QudaDslashPolicy::QUDA_GDR_DSLASH ||
                      i == QudaDslashPolicy::QUDA_FUSED_GDR_DSLASH ||
@@ -1943,11 +1945,11 @@ namespace quda
               {
                 QudaDslashPolicy policy = DslashFactory<Dslash>::blacklist_map(i);
                 auto dslashImp = DslashFactory<Dslash>::create(policy);
-                (*dslashImp)(dslash, &(this->in), volume, ghostFace, profile);
+                (*dslashImp)(dslash, this->in, volume, ghostFace, profile);
               }
 
               auto dslashImp = DslashFactory<Dslash>::create(i);
-              (*dslashImp)(dslash, &(this->in), volume, ghostFace, profile);
+              (*dslashImp)(dslash, this->in, volume, ghostFace, profile);
 
             } else if (i != QudaDslashPolicy::QUDA_DSLASH_POLICY_DISABLED) {
               errorQuda("Unsupported dslash policy %d\n", static_cast<int>(i));
@@ -1981,7 +1983,7 @@ namespace quda
      dslashParam.remote_write = (p2p_policies[tp.aux.y] == QudaP2PPolicy::QUDA_P2P_REMOTE_WRITE ? 1 : 0); // set whether we are using remote packing writes or copy engines
 
      auto dslashImp = DslashFactory<Dslash>::create(static_cast<QudaDslashPolicy>(tp.aux.x));
-     (*dslashImp)(dslash, &in, volume, ghostFace, profile);
+     (*dslashImp)(dslash, in, volume, ghostFace, profile);
 
      // restore p2p state
      comm_enable_peer2peer(p2p_enabled);
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 9b2584ba26..069bdd7ca5 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -1254,27 +1254,26 @@ namespace quda {
   }
 
   // helper for creating extended gauge fields
-  GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile, bool redundant_comms,
+  GaugeField *createExtendedGauge(const GaugeField &in, const lat_dim_t &R, TimeProfile &profile, bool redundant_comms,
                                   QudaReconstructType recon)
   {
     GaugeFieldParam gParamEx(in);
-    // gParamEx.location = QUDA_CUDA_FIELD_LOCATION;
     gParamEx.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
     gParamEx.pad = 0;
     gParamEx.nFace = 1;
-    gParamEx.tadpole = in.Tadpole();
-    gParamEx.anisotropy = in.Anisotropy();
     for (int d = 0; d < 4; d++) {
       gParamEx.x[d] += 2 * R[d];
       gParamEx.r[d] = R[d];
     }
-    if (recon != QUDA_RECONSTRUCT_INVALID) gParamEx.reconstruct = recon;
-    gParamEx.setPrecision(gParamEx.Precision(), true);
+    if (recon != QUDA_RECONSTRUCT_INVALID && recon != in.Reconstruct()) {
+      gParamEx.reconstruct = recon;
+      gParamEx.setPrecision(gParamEx.Precision());
+    }
 
     auto *out = new GaugeField(gParamEx);
 
     // copy input field into the extended device gauge field
-    copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION); // wrong location if both fields cpu
+    copyExtendedGauge(*out, in, in.Location());
 
     // now fill up the halos
     out->exchangeExtendedGhost(R, profile, redundant_comms);
diff --git a/lib/gauge_loop_trace.cu b/lib/gauge_loop_trace.cu
index 69e4164c7d..6425e39cf8 100644
--- a/lib/gauge_loop_trace.cu
+++ b/lib/gauge_loop_trace.cu
@@ -13,7 +13,7 @@ namespace quda {
     std::vector<reduce_t>& loop_traces;
     double factor;
     const paths<1> p;
-    unsigned int sharedBytesPerThread() const override { return 4 * sizeof(int); } // for threda_array
+    unsigned int sharedBytesPerThread() const override { return 4 * sizeof(int); } // for thread_array
 
   public:
     // max block size of 8 is arbitrary for now
@@ -47,7 +47,8 @@ namespace quda {
       return (p.count * mat_mul_flops + p.num_paths * (2 * Nc + 2)) * u.Volume();
     }
 
-    long long bytes() const override {
+    long long bytes() const override
+    {
       // links * one LatticeColorMatrix worth of data
       return p.count * u.Bytes() / 4;
     }
diff --git a/lib/gauge_wilson_flow.cu b/lib/gauge_wilson_flow.cu
index 78456b665f..307d52472b 100644
--- a/lib/gauge_wilson_flow.cu
+++ b/lib/gauge_wilson_flow.cu
@@ -25,7 +25,9 @@ namespace quda {
     unsigned int sharedBytesPerThread() const
     {
       // use ThreadLocalCache if using Symanzik improvement for two Link fields
-      return (wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper<Float>::type) : 0)
+      return (wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW ?
+                2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper<Float>::type) :
+                0)
         + 4 * sizeof(int); // for thread_array
     }
 
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index c9576f6f01..b07a0b2c7b 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -158,6 +158,9 @@ static TimeProfile profileExtendedGauge("createExtendedGaugeField");
 //!<Profiler for computeCloverForceQuda
 static TimeProfile profileCloverForce("computeCloverForceQuda");
 
+//!< Profiler for computeTMCloverForceQuda
+static TimeProfile profileTMCloverForce("computeTMCloverForceQuda");
+
 //!<Profiler for computeStaggeredForceQuda
 static TimeProfile profileStaggeredForce("computeStaggeredForceQuda");
 
@@ -1381,6 +1384,7 @@ void endQuda(void)
     profileGaugeUpdate.Print();
     profileExtendedGauge.Print();
     profileCloverForce.Print();
+    profileTMCloverForce.Print();
     profileStaggeredForce.Print();
     profileHISQForce.Print();
     profileContract.Print();
@@ -3834,7 +3838,6 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE;
   gParamMom.field = &cpuMom;
   gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
-  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
 
   GaugeField cudaMom = qudaGaugeParam->use_resident_mom ? momResident.create_alias() : GaugeField(gParamMom);
@@ -4490,158 +4493,144 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
                             QudaInvertParam *inv_param)
 {
   using namespace quda;
-  auto profile = pushProfile(profileCloverForce);
+  auto profile = pushProfile(profileCloverForce, inv_param->secs, inv_param->gflops);
 
   checkGaugeParam(gauge_param);
   if (!gaugePrecise) errorQuda("No resident gauge field");
+  if (!cloverPrecise) errorQuda("No resident clover field");
 
   GaugeFieldParam fParam(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS);
   // create the host momentum field
-  fParam.location = QUDA_CPU_FIELD_LOCATION;
-  fParam.reconstruct = QUDA_RECONSTRUCT_10;
-  fParam.order = gauge_param->gauge_order;
-  GaugeField cpuMom(fParam);
+  GaugeField cpuMom = !gauge_param->use_resident_mom ? GaugeField(fParam) : GaugeField();
 
   // create the device momentum field
   fParam.location = QUDA_CUDA_FIELD_LOCATION;
-  fParam.create = QUDA_ZERO_FIELD_CREATE;
-  fParam.setPrecision(fParam.Precision(), true);
-  GaugeField cudaMom(fParam);
+  fParam.create = gauge_param->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE;
+  fParam.field = &cpuMom;
+  fParam.reconstruct = QUDA_RECONSTRUCT_10;
+  fParam.setPrecision(gauge_param->cuda_prec, true);
 
-  // create the device force field
-  fParam.link_type = QUDA_GENERAL_LINKS;
-  fParam.create = QUDA_ZERO_FIELD_CREATE;
-  fParam.reconstruct = QUDA_RECONSTRUCT_NO;
-  fParam.setPrecision(fParam.Precision(), true);
-  GaugeField cudaForce(fParam);
+  if (gauge_param->use_resident_mom && !momResident.Length()) errorQuda("No resident momentum field to use");
+  GaugeField cudaMom = gauge_param->use_resident_mom ? momResident.create_alias() : GaugeField(fParam);
+  if (gauge_param->use_resident_mom && gauge_param->overwrite_mom) cudaMom.zero();
 
-  ColorSpinorParam qParam;
-  qParam.location = QUDA_CUDA_FIELD_LOCATION;
-  qParam.nColor = 3;
-  qParam.nSpin = 4;
-  qParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-  qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
-  qParam.nDim = 4;
+  if (inv_param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION)
+    errorQuda("Force computation only supports solution to MatPCDagMatPC");
+  ColorSpinorParam qParam(nullptr, *inv_param, fParam.x, false, QUDA_CUDA_FIELD_LOCATION);
   qParam.setPrecision(fParam.Precision(), fParam.Precision(), true);
-  qParam.pad = 0;
-  for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir];
-
-  // create the device quark field
   qParam.create = QUDA_NULL_FIELD_CREATE;
   qParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
 
-  std::vector<ColorSpinorField*> quarkX, quarkP;
-  for (int i=0; i<nvector; i++) {
-    quarkX.push_back(ColorSpinorField::Create(qParam));
-    quarkP.push_back(ColorSpinorField::Create(qParam));
-  }
+  std::vector<ColorSpinorField> x(nvector), x0(nvector);
+  std::vector<double> force_coeff(nvector);
+  std::vector<array<double, 2>> ferm_epsilon(nvector);
 
-  qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-  qParam.x[0] /= 2;
-  ColorSpinorField tmp(qParam);
+  QudaParity parity = inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY;
 
-  // create the host quark field
-  qParam.location = QUDA_CPU_FIELD_LOCATION;
-  qParam.create = QUDA_REFERENCE_FIELD_CREATE;
-  qParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
-  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // need expose this to interface
+  for (int i = 0; i < nvector; i++) {
+    x[i] = ColorSpinorField(qParam);
 
-  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) ||
-    (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);
-  DiracParam diracParam;
-  setDiracParam(diracParam, inv_param, pc_solve);
-  Dirac *dirac = Dirac::create(diracParam);
+    if (!inv_param->use_resident_solution) {
+      ColorSpinorParam cpuParam(h_x[i], *inv_param, fParam.x, true, inv_param->input_location);
+      ColorSpinorField cpuQuarkX(cpuParam);
+      x[i][parity] = cpuQuarkX;
+    } else {
+      x[i][parity] = solutionResident[i];
+    }
 
-  if (inv_param->use_resident_solution) {
-    if (solutionResident.size() < (unsigned int)nvector)
-      errorQuda("solutionResident.size() %lu does not match number of shifts %d",
-		solutionResident.size(), nvector);
+    force_coeff[i] = 2.0 * dt * coeff[i] * kappa2;
+    ferm_epsilon[i] = {2.0 * ck * coeff[i] * dt, -kappa2 * 2.0 * ck * coeff[i] * dt};
   }
 
-  GaugeField &gaugeEx = *extendedGaugeResident;
-
-  // create oprod and trace fields
-  fParam.geometry = QUDA_TENSOR_GEOMETRY;
-  GaugeField oprod(fParam);
-
-  std::vector<double> force_coeff(nvector);
-  // loop over different quark fields
-  for(int i=0; i<nvector; i++){
-    ColorSpinorField &x = *(quarkX[i]);
-    ColorSpinorField &p = *(quarkP[i]);
-
-    if (!inv_param->use_resident_solution) {
-      // for downloading x_e
-      qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-      qParam.x[0] /= 2;
+  if (inv_param->use_resident_solution && solutionResident.size() < (unsigned int)nvector)
+    errorQuda("solutionResident.size() %lu does not match number of shifts %d", solutionResident.size(), nvector);
 
-      // Wrap the even-parity MILC quark field
-      qParam.v = h_x[i];
-      ColorSpinorField cpuQuarkX(qParam); // create host quark field
+  // Make sure extendedGaugeResident has the correct R
+  if (extendedGaugeResident) delete extendedGaugeResident;
+  extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileCloverForce);
+  GaugeField &gaugeEx = *extendedGaugeResident;
 
-      x.Even() = cpuQuarkX;
+  computeCloverForce(cudaMom, gaugeEx, *gaugePrecise, *cloverPrecise, x, x0, force_coeff, ferm_epsilon,
+                     2.0 * ck * multiplicity * dt, false, *inv_param);
 
-      gamma5(x.Even(), x.Even());
-    } else {
-      x.Even() = solutionResident[i];
-    }
+  // copy the updated momentum field back to the host
+  if (gauge_param->return_result_mom) cpuMom.copy(cudaMom);
+  if (gauge_param->make_resident_mom && gauge_param->use_resident_mom)
+    std::exchange(momResident, cudaMom);
+  else if (!gauge_param->make_resident_mom)
+    momResident = GaugeField();
+}
 
-    dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY);
-    dirac->M(p.Even(), x.Even());
-    dirac->Dagger(QUDA_DAG_YES);
-    dirac->Dslash(p.Odd(), p.Even(), QUDA_ODD_PARITY);
-    dirac->Dagger(QUDA_DAG_NO);
+void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coeff, int nvector,
+                              QudaGaugeParam *gauge_param, QudaInvertParam *inv_param, int detratio)
+{
+  using namespace quda;
+  auto profile = pushProfile(profileTMCloverForce, inv_param->secs, inv_param->gflops);
 
-    gamma5(x, x);
-    gamma5(p, p);
+  checkGaugeParam(gauge_param);
+  if (!gaugePrecise) errorQuda("No resident gauge field");
+  if (!cloverPrecise) errorQuda("No resident clover field");
 
-    force_coeff[i] = 2.0*dt*coeff[i]*kappa2;
-  }
+  double kappa = inv_param->kappa;
+  double k_csw_ov_8 = kappa * inv_param->clover_csw / 8.0;
 
-  computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);
+  GaugeFieldParam gParamMom(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS);
+  GaugeField cpuMom = !gauge_param->use_resident_mom ? GaugeField(gParamMom) : GaugeField();
 
-  // In double precision the clover derivative is faster with no reconstruct
-  GaugeField *u = &gaugeEx;
-  if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
-    GaugeFieldParam param(gaugeEx);
-    param.reconstruct = QUDA_RECONSTRUCT_NO;
-    u = new GaugeField(param);
-    u -> copy(gaugeEx);
-  }
+  // create the device momentum field
+  gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
+  gParamMom.create = gauge_param->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE;
+  gParamMom.field = &cpuMom;
+  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
+  gParamMom.setPrecision(gauge_param->cuda_prec, true);
 
-  computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt);
+  if (gauge_param->use_resident_mom && !momResident.Length()) errorQuda("No resident momentum field to use");
+  GaugeField gpuMom = gauge_param->use_resident_mom ? momResident.create_alias() : GaugeField(gParamMom);
+  if (gauge_param->use_resident_mom && gauge_param->overwrite_mom) gpuMom.zero();
 
-  /* Now the U dA/dU terms */
-  std::vector< std::vector<double> > ferm_epsilon(nvector);
-  for (int shift = 0; shift < nvector; shift++) {
-    ferm_epsilon[shift].reserve(2);
-    ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt;
-    ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt;
-  }
+  if (inv_param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION)
+    errorQuda("Force computation only supports solution to MatPCDagMatPC");
+  ColorSpinorParam qParam(nullptr, *inv_param, gParamMom.x, false, QUDA_CUDA_FIELD_LOCATION);
+  qParam.setPrecision(gauge_param->cuda_prec, gauge_param->cuda_prec, true);
+  qParam.create = QUDA_NULL_FIELD_CREATE;
+  qParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
 
-  computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);
+  std::vector<ColorSpinorField> x(nvector), x0(nvector);
+  std::vector<double> force_coeff(nvector);
+  std::vector<array<double, 2>> ferm_epsilon(nvector);
 
-  GaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);
+  QudaParity parity = inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ? QUDA_EVEN_PARITY : QUDA_ODD_PARITY;
 
-  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
-  cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);
+  for (int i = 0; i < nvector; i++) {
+    x[i] = ColorSpinorField(qParam);
+    ColorSpinorParam cpuParam(h_x[i], *inv_param, gParamMom.x, true, inv_param->input_location);
+    ColorSpinorField cpuQuarkX(cpuParam);
+    x[i][parity] = cpuQuarkX; // in tmLQCD-parlance this is the odd part of X
 
-  if (u != &gaugeEx) delete u;
+    if (detratio) {
+      x0[i] = ColorSpinorField(qParam);
+      ColorSpinorParam cpuParam0(h_x0[i], *inv_param, gParamMom.x, true, inv_param->input_location);
+      ColorSpinorField cpuQuarkX0(cpuParam0);
+      x0[i][parity] = cpuQuarkX0;
+    }
 
-  updateMomentum(cudaMom, -1.0, cudaForce, "clover");
+    force_coeff[i] = 1.0 * coeff[i];
+    ferm_epsilon[i] = {k_csw_ov_8 * coeff[i], k_csw_ov_8 * coeff[i] / (kappa * kappa)};
+  }
 
-  // copy the outer product field back to the host
-  cpuMom.copy(cudaMom);
+  // Make sure extendedGaugeResident has the correct R
+  if (extendedGaugeResident) delete extendedGaugeResident;
+  extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileTMCloverForce);
+  GaugeField &gaugeEx = *extendedGaugeResident;
 
-  for (int i=0; i<nvector; i++) {
-    delete quarkX[i];
-    delete quarkP[i];
-  }
+  computeCloverForce(gpuMom, gaugeEx, *gaugePrecise, *cloverPrecise, x, x0, force_coeff, ferm_epsilon,
+                     k_csw_ov_8 * 32.0, detratio, *inv_param);
 
-#if 0
-  if (inv_param->use_resident_solution) solutionResident.clear();
-#endif
-  delete dirac;
+  if (gauge_param->return_result_mom) cpuMom.copy(gpuMom);
+  if (gauge_param->make_resident_mom && gauge_param->use_resident_mom)
+    std::exchange(momResident, gpuMom);
+  else if (!gauge_param->make_resident_mom)
+    momResident = GaugeField();
 }
 
 void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index 29528e0829..6b4c5e669e 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -333,7 +333,7 @@ namespace quda {
     initGhostFaceBuffer = false;
   }
 
-  void LatticeField::createComms(bool no_comms_fill)
+  void LatticeField::createComms(bool no_comms_fill) const
   {
     destroyComms(); // if we are requesting a new number of faces destroy and start over
 
@@ -394,7 +394,7 @@ namespace quda {
     initComms = true;
   }
 
-  void LatticeField::destroyComms()
+  void LatticeField::destroyComms() const
   {
     if (Location() != QUDA_CUDA_FIELD_LOCATION) return;
 
@@ -444,7 +444,7 @@ namespace quda {
 
   }
 
-  void LatticeField::createIPCComms()
+  void LatticeField::createIPCComms() const
   {
     if ( initIPCComms && !ghost_field_reset ) return;
 
diff --git a/lib/solver.cpp b/lib/solver.cpp
index 12cce8f532..3014b4f35f 100644
--- a/lib/solver.cpp
+++ b/lib/solver.cpp
@@ -9,9 +9,7 @@
 
 namespace quda {
 
-  static void report(const char *type) {
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a %s solver\n", type);
-  }
+  static void report(const char *type) { logQuda(QUDA_VERBOSE, "Creating a %s solver\n", type); }
 
   Solver::Solver(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon,
                  const DiracMatrix &matEig, SolverParam &param, TimeProfile &profile) :
@@ -284,7 +282,7 @@ namespace quda {
       deflation_space *space = reinterpret_cast<deflation_space *>(param.eig_param.preserve_deflation_space);
 
       if (space && space->evecs.size() != 0) {
-        if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restoring deflation space of size %lu\n", space->evecs.size());
+        logQuda(QUDA_VERBOSE, "Restoring deflation space of size %lu\n", space->evecs.size());
 
         if ((!space->svd && param.eig_param.n_conv != (int)space->evecs.size())
             || (space->svd && 2 * param.eig_param.n_conv != (int)space->evecs.size()))
@@ -321,7 +319,7 @@ namespace quda {
   {
     if (deflate_init) {
       if (param.eig_param.preserve_deflation) {
-        if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Preserving deflation space of size %lu\n", evecs.size());
+        logQuda(QUDA_VERBOSE, "Preserving deflation space of size %lu\n", evecs.size());
 
         if (param.eig_param.preserve_deflation_space) {
           deflation_space *space = reinterpret_cast<deflation_space *>(param.eig_param.preserve_deflation_space);
@@ -438,13 +436,11 @@ namespace quda {
   }
 
   void Solver::PrintStats(const char* name, int k, double r2, double b2, double hq2) {
-    if (getVerbosity() >= QUDA_VERBOSE) {
-      if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
-        printfQuda("%s: %5d iterations, <r,r> = %9.6e, |r|/|b| = %9.6e, heavy-quark residual = %9.6e\n", name, k, r2,
-                   sqrt(r2 / b2), hq2);
-      } else {
-        printfQuda("%s: %5d iterations, <r,r> = %9.6e, |r|/|b| = %9.6e\n", name, k, r2, sqrt(r2 / b2));
-      }
+    if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
+      logQuda(QUDA_VERBOSE, "%s: %5d iterations, <r,r> = %9.6e, |r|/|b| = %9.6e, heavy-quark residual = %9.6e\n", name,
+              k, r2, sqrt(r2 / b2), hq2);
+    } else {
+      logQuda(QUDA_VERBOSE, "%s: %5d iterations, <r,r> = %9.6e, |r|/|b| = %9.6e\n", name, k, r2, sqrt(r2 / b2));
     }
 
     if (std::isnan(r2) || std::isinf(r2)) errorQuda("Solver appears to have diverged");
@@ -452,26 +448,28 @@ namespace quda {
 
   void Solver::PrintSummary(const char *name, int k, double r2, double b2,
                             double r2_tol, double hq_tol) {
-    if (getVerbosity() >= QUDA_SUMMARIZE) {
-      if (param.compute_true_res) {
-	if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
-          printfQuda("%s: Convergence at %d iterations, L2 relative residual: iterated = %9.6e, true = %9.6e "
-                     "(requested = %9.6e), heavy-quark residual = %9.6e (requested = %9.6e)\n",
-                     name, k, sqrt(r2 / b2), param.true_res, sqrt(r2_tol / b2), param.true_res_hq, hq_tol);
-        } else {
-          printfQuda("%s: Convergence at %d iterations, L2 relative residual: iterated = %9.6e, true = %9.6e "
-                     "(requested = %9.6e)\n",
-                     name, k, sqrt(r2 / b2), param.true_res, sqrt(r2_tol / b2));
-        }
+    if (param.compute_true_res) {
+      if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
+        logQuda(QUDA_SUMMARIZE,
+                "%s: Convergence at %d iterations, L2 relative residual: iterated = %9.6e, true = %9.6e "
+                "(requested = %9.6e), heavy-quark residual = %9.6e (requested = %9.6e)\n",
+                name, k, sqrt(r2 / b2), param.true_res, sqrt(r2_tol / b2), param.true_res_hq, hq_tol);
       } else {
-	if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
-          printfQuda("%s: Convergence at %d iterations, L2 relative residual: iterated = %9.6e "
-                     "(requested = %9.6e), heavy-quark residual = %9.6e (requested = %9.6e)\n",
-                     name, k, sqrt(r2 / b2), sqrt(r2_tol / b2), param.true_res_hq, hq_tol);
-        } else {
-          printfQuda("%s: Convergence at %d iterations, L2 relative residual: iterated = %9.6e (requested = %9.6e)\n",
-                     name, k, sqrt(r2 / b2), sqrt(r2_tol / b2));
-        }
+        logQuda(QUDA_SUMMARIZE,
+                "%s: Convergence at %d iterations, L2 relative residual: iterated = %9.6e, true = %9.6e "
+                "(requested = %9.6e)\n",
+                name, k, sqrt(r2 / b2), param.true_res, sqrt(r2_tol / b2));
+      }
+    } else {
+      if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
+        logQuda(QUDA_SUMMARIZE,
+                "%s: Convergence at %d iterations, L2 relative residual: iterated = %9.6e "
+                "(requested = %9.6e), heavy-quark residual = %9.6e (requested = %9.6e)\n",
+                name, k, sqrt(r2 / b2), sqrt(r2_tol / b2), param.true_res_hq, hq_tol);
+      } else {
+        logQuda(QUDA_SUMMARIZE,
+                "%s: Convergence at %d iterations, L2 relative residual: iterated = %9.6e (requested = %9.6e)\n", name,
+                k, sqrt(r2 / b2), sqrt(r2_tol / b2));
       }
     }
   }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3a95e355e8..472d6f63db 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -246,6 +246,13 @@ target_link_libraries(gauge_path_test ${TEST_LIBS})
 quda_checkbuildtest(gauge_path_test QUDA_BUILD_ALL_TESTS)
 install(TARGETS gauge_path_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR})
 
+if(QUDA_DIRAC_CLOVER OR QUDA_DIRAC_TWISTED_CLOVER)
+  add_executable(clover_force_test clover_force_test.cpp)
+  target_link_libraries(clover_force_test ${TEST_LIBS})
+  quda_checkbuildtest(clover_force_test QUDA_BUILD_ALL_TESTS)
+  install(TARGETS clover_force_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
+
 add_executable(gauge_alg_test gauge_alg_test.cpp)
 target_link_libraries(gauge_alg_test ${TEST_LIBS})
 quda_checkbuildtest(gauge_alg_test QUDA_BUILD_ALL_TESTS)
@@ -1297,6 +1304,29 @@ foreach(prec IN LISTS TEST_PRECS)
                    --dim 2 4 6 8 --prec ${prec}
                    --gtest_output=xml:gauge_path_test_${prec}.xml)
 
+
+  if(QUDA_DIRAC_CLOVER)
+    add_test(NAME clover_force_test_${prec}
+           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:clover_force_test> ${MPIEXEC_POSTFLAGS}
+                   --dim 2 4 6 8 --prec ${prec}
+                   --dslash-type clover --compute-clover true
+                   --solution-type mat-pc-dag-mat-pc
+                   --matpc odd-odd-asym --dagger
+                   --enable-testing true
+                   --gtest_output=xml:clover_force_test_${prec}.xml)
+   endif()
+
+  if(QUDA_DIRAC_TWISTED_CLOVER)
+    add_test(NAME tmc_clover_force_test_${prec}
+           COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:clover_force_test> ${MPIEXEC_POSTFLAGS}
+                   --dim 2 4 6 8 --prec ${prec}
+                   --dslash-type twisted-clover --compute-clover true
+                   --solution-type mat-pc-dag-mat-pc
+                   --matpc odd-odd-asym --dagger
+                   --enable-testing true
+                   --gtest_output=xml:tmc_clover_force_test_${prec}.xml)
+   endif()
+
   if(QUDA_DIRAC_STAGGERED)
     add_test(NAME unitarize_link_${prec}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:unitarize_link_test> ${MPIEXEC_POSTFLAGS}
diff --git a/tests/clover_force_test.cpp b/tests/clover_force_test.cpp
new file mode 100644
index 0000000000..d1cbd089ce
--- /dev/null
+++ b/tests/clover_force_test.cpp
@@ -0,0 +1,233 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "clover_force_reference.h"
+#include "misc.h"
+#include <color_spinor_field.h> // convenient quark field container
+#include <command_line_params.h>
+#include <gauge_field.h>
+#include <host_utils.h>
+#include <quda.h>
+#include <gtest/gtest.h>
+
+static int force_check;
+static int path_check;
+static double force_deviation;
+QudaGaugeParam gauge_param;
+QudaInvertParam inv_param;
+quda::GaugeField gauge;
+quda::GaugeField mom;
+quda::GaugeField mom_ref;
+std::vector<char> clover;
+std::vector<char> clover_inv;
+
+void init(int argc, char **argv)
+{
+  // Set QUDA's internal parameters
+  gauge_param = newQudaGaugeParam();
+  setWilsonGaugeParam(gauge_param);
+
+  inv_param = newQudaInvertParam();
+  setInvertParam(inv_param);
+  setDims(gauge_param.X);
+
+  // Allocate host gauge field objects
+  quda::GaugeFieldParam param(gauge_param, nullptr, QUDA_SU3_LINKS);
+  param.create = QUDA_NULL_FIELD_CREATE;
+  param.order = QUDA_QDP_GAUGE_ORDER;
+  gauge = quda::GaugeField(param);
+
+  printfQuda("Randomizing gauge fields... ");
+  constructHostGaugeField(gauge, gauge_param, argc, argv);
+
+  printfQuda("Sending gauge field to GPU\n");
+  loadGaugeQuda(gauge.raw_pointer(), &gauge_param);
+
+  param.order = QUDA_MILC_GAUGE_ORDER;
+  param.link_type = QUDA_ASQTAD_MOM_LINKS;
+  param.reconstruct = QUDA_RECONSTRUCT_10;
+  param.create = QUDA_ZERO_FIELD_CREATE;
+  mom = quda::GaugeField(param);
+  mom_ref = quda::GaugeField(param);
+
+  // Allocate host side memory for clover terms if needed.
+  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+    clover.resize(V * clover_site_size * host_clover_data_type_size);
+    clover_inv.resize(V * clover_site_size * host_spinor_data_type_size);
+    compute_clover = true;
+    constructHostCloverField(clover.data(), clover_inv.data(), inv_param);
+    // Load the clover terms to the device
+    loadCloverQuda(clover.data(), clover_inv.data(), &inv_param);
+  } else {
+    errorQuda("dslash type ( dslash_type = %d ) must have the clover", dslash_type);
+  }
+}
+
+void destroy()
+{
+  gauge = {};
+  mom = {};
+  mom_ref = {};
+}
+
+using test_t = ::testing::tuple<bool, int>;
+
+std::tuple<int, double> clover_force_test(test_t param)
+{
+  bool detratio = ::testing::get<0>(param);
+  int nvector = ::testing::get<1>(param);
+
+  std::vector<quda::ColorSpinorField> out_nvector(nvector);
+  std::vector<void *> in(nvector);
+  std::vector<quda::ColorSpinorField> out_nvector0(nvector);
+  std::vector<void *> in0(nvector);
+
+  quda::ColorSpinorParam cs_param;
+  constructWilsonTestSpinorParam(&cs_param, &inv_param, &gauge_param);
+
+  quda::RNG rng(mom, 1234);
+
+  inv_param.dagger = static_cast<QudaDagType>(dagger);
+  inv_param.num_offset = nvector;
+  for (int i = 0; i < nvector; i++) {
+    // Allocate memory and set pointers
+    out_nvector[i] = quda::ColorSpinorField(cs_param);
+    spinorNoise(out_nvector[i], rng, QUDA_NOISE_GAUSS);
+    in[i] = out_nvector[i].data();
+
+    out_nvector0[i] = quda::ColorSpinorField(cs_param);
+    spinorNoise(out_nvector0[i], rng, QUDA_NOISE_GAUSS);
+    in0[i] = out_nvector0[i].data();
+  }
+
+  std::vector<double> coeff(nvector);
+  for (int i = 0; i < nvector; i++) {
+    coeff[i] = 4. * inv_param.kappa * inv_param.kappa;
+    coeff[i] += coeff[i] * (i + 1) / 10.0;
+  }
+  gauge_param.gauge_order = QUDA_MILC_GAUGE_ORDER;
+  gauge_param.overwrite_mom = 1;
+
+  if (getTuning() == QUDA_TUNE_YES)
+    computeTMCloverForceQuda(mom.data(), in.data(), in0.data(), coeff.data(), nvector, &gauge_param, &inv_param,
+                             detratio);
+
+  // Multiple execution to exclude warmup time in the first run
+  double time_sec = 0.0;
+  double gflops = 0.0;
+  for (int i = 0; i < niter; i++) {
+    computeTMCloverForceQuda(mom.data(), in.data(), in0.data(), coeff.data(), nvector, &gauge_param, &inv_param,
+                             detratio);
+    time_sec += inv_param.secs;
+    gflops += inv_param.gflops;
+  }
+
+  int *check_out = true ? &force_check : &path_check;
+  std::array<void *, 4> u = {gauge.data(0), gauge.data(1), gauge.data(2), gauge.data(3)};
+  if (verify_results) {
+    gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
+    mom_ref.zero();
+    TMCloverForce_reference(mom_ref.data(), in.data(), in0.data(), coeff.data(), nvector, u, clover, clover_inv,
+                            &gauge_param, &inv_param, detratio);
+    *check_out
+      = compare_floats(mom.data(), mom_ref.data(), 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
+    // if (compute_force)
+    strong_check_mom(mom.data(), mom_ref.data(), 4 * V, gauge_param.cpu_prec);
+  }
+
+  logQuda(QUDA_VERBOSE, "\nComputing momentum action\n");
+  gauge_param.gauge_order = QUDA_MILC_GAUGE_ORDER;
+  auto action_quda = momActionQuda(mom.data(), &gauge_param);
+  auto action_ref = mom_action(mom_ref.data(), gauge_param.cpu_prec, 4 * V);
+  force_deviation = std::abs(action_quda - action_ref) / std::abs(action_ref);
+  logQuda(QUDA_VERBOSE, "QUDA action = %e, reference = %e relative deviation = %e\n", action_quda, action_ref,
+          force_deviation);
+  printfQuda("QUDA action = %e, reference = %e relative deviation = %e\n", action_quda, action_ref, force_deviation);
+  printfQuda("Force calculation total time = %.2f ms ; overall performance : %.2f GFLOPS\n", time_sec * 1e+3,
+             gflops / time_sec);
+
+  return {force_check, force_deviation};
+}
+
+class CloverForceTest : public ::testing::TestWithParam<test_t>
+{
+protected:
+  test_t param;
+
+public:
+  CloverForceTest() : param(GetParam()) { }
+};
+
+TEST_P(CloverForceTest, verify)
+{
+  auto deviation = clover_force_test(GetParam());
+  ASSERT_EQ(std::get<0>(deviation), 1) << "CPU and QUDA force implementations do not agree";
+  ASSERT_LE(std::get<1>(deviation), getTolerance(cuda_prec))
+    << "CPU and QUDA momentum action implementations do not agree";
+}
+
+static void display_test_info()
+{
+  printfQuda("running the following test:\n");
+
+  printfQuda("link_precision           link_reconstruct           space_dim(x/y/z)              T_dimension        "
+             "Gauge_order    niter\n");
+  printfQuda("%s                       %s                         %d/%d/%d                       %d                  "
+             "%s           %d\n",
+             get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim,
+             get_gauge_order_str(QUDA_MILC_GAUGE_ORDER), niter);
+  printfQuda("Grid partition info:     X  Y  Z  T\n");
+  printfQuda("                         %d  %d  %d  %d\n", dimPartitioned(0), dimPartitioned(1), dimPartitioned(2),
+             dimPartitioned(3));
+}
+
+std::string gettestname(::testing::TestParamInfo<test_t> param)
+{
+  std::string name;
+  if (::testing::get<0>(param.param)) name += std::string("ratio_");
+  name += std::string("nvector_") + std::to_string(::testing::get<1>(param.param));
+  return name;
+}
+
+int main(int argc, char **argv)
+{
+  // initialize google test
+  ::testing::InitGoogleTest(&argc, argv);
+  // return code for google test
+  int test_rc = 0;
+
+  // command line options
+  auto app = make_app();
+  add_clover_force_option_group(app);
+  add_testing_option_group(app);
+  try {
+    app->parse(argc, argv);
+  } catch (const CLI::ParseError &e) {
+    return app->exit(e);
+  }
+
+  initComms(argc, argv, gridsize_from_cmdline);
+  initQuda(device_ordinal);
+  init(argc, argv);
+
+  display_test_info();
+
+  if (enable_testing) {
+    // Ensure gtest prints only from rank 0
+    ::testing::TestEventListeners &listeners = ::testing::UnitTest::GetInstance()->listeners();
+    if (quda::comm_rank() != 0) { delete listeners.Release(listeners.default_result_printer()); }
+
+    test_rc = RUN_ALL_TESTS();
+  } else {
+    clover_force_test({detratio, Nsrc});
+  }
+
+  destroy();
+  endQuda();
+  finalizeComms();
+  return test_rc;
+}
+
+INSTANTIATE_TEST_SUITE_P(CloverForceTest, CloverForceTest,
+                         ::testing::Combine(::testing::Values(false, true), ::testing::Values(1, 8)), gettestname);
diff --git a/tests/host_reference/CMakeLists.txt b/tests/host_reference/CMakeLists.txt
index 464b06be5c..a1fcbdacb0 100644
--- a/tests/host_reference/CMakeLists.txt
+++ b/tests/host_reference/CMakeLists.txt
@@ -8,6 +8,7 @@ target_sources(
   dslash_reference.cpp
   dslash_test_helpers.cpp
   gauge_force_reference.cpp
+  clover_force_reference.cpp
   hisq_force_reference.cpp
   staggered_dslash_reference.cpp
   wilson_dslash_reference.cpp)
diff --git a/tests/host_reference/clover_force_reference.cpp b/tests/host_reference/clover_force_reference.cpp
new file mode 100644
index 0000000000..3d4c7e4e44
--- /dev/null
+++ b/tests/host_reference/clover_force_reference.cpp
@@ -0,0 +1,1072 @@
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <type_traits>
+
+#include "gauge_field.h"
+#include "quda.h"
+#include "color_spinor_field.h" // convenient quark field container
+
+#include "clover_force_reference.h"
+#include "host_utils.h"
+#include "misc.h"
+#include "dslash_reference.h"
+#include "wilson_dslash_reference.h"
+#include "gauge_force_reference.h"
+#include "host_utils.h"
+#include "gamma_reference.h"
+
+#include <Eigen/Dense>
+
+// TODO: pass the projector in as an argument
+template <typename Float> void multiplySpinorByDiracProjector(Float *res, int projIdx, const Float *spinorIn)
+{
+  for (int i = 0; i < 4 * 3 * 2; i++) res[i] = 0.0;
+
+  for (int s = 0; s < 4; s++) {
+    for (int t = 0; t < 4; t++) {
+      Float projRe = projector[projIdx][s][t][0];
+      Float projIm = projector[projIdx][s][t][1];
+
+      for (int m = 0; m < 3; m++) {
+        Float spinorRe = spinorIn[t * (3 * 2) + m * (2) + 0];
+        Float spinorIm = spinorIn[t * (3 * 2) + m * (2) + 1];
+        res[s * (3 * 2) + m * (2) + 0] += projRe * spinorRe - projIm * spinorIm;
+        res[s * (3 * 2) + m * (2) + 1] += projRe * spinorIm + projIm * spinorRe;
+      }
+    }
+  }
+}
+
+// TODO: pass the gamma matrix in as an argument
+template <typename Float> void multiplySpinorByDiracgamma(Float *res, int gammaIdx, const Float *spinorIn)
+{
+  for (int i = 0; i < 4 * 3 * 2; i++) res[i] = 0.0;
+
+  for (int s = 0; s < 4; s++) {
+    for (int t = 0; t < 4; t++) {
+      Float projRe = local_gamma[gammaIdx][s][t][0];
+      Float projIm = local_gamma[gammaIdx][s][t][1];
+
+      for (int m = 0; m < 3; m++) {
+        Float spinorRe = spinorIn[t * (3 * 2) + m * (2) + 0];
+        Float spinorIm = spinorIn[t * (3 * 2) + m * (2) + 1];
+        res[s * (3 * 2) + m * (2) + 0] += projRe * spinorRe - projIm * spinorIm;
+        res[s * (3 * 2) + m * (2) + 1] += projRe * spinorIm + projIm * spinorRe;
+      }
+    }
+  }
+}
+template <typename sFloat, typename gFloat> void outerProdSpinTrace(gFloat *gauge, sFloat *x, sFloat *y)
+{
+
+  // outer product over color
+
+  for (int i = 0; i < 3; i++) {
+
+    for (int j = 0; j < 3; j++) {
+      gauge[j * 6 + i * 2 + 0] = x[0 * 6 + j * 2 + 0] * y[0 * 6 + i * 2 + 0];
+      gauge[j * 6 + i * 2 + 0] += x[0 * 6 + j * 2 + 1] * y[0 * 6 + i * 2 + 1];
+      gauge[j * 6 + i * 2 + 1] = x[0 * 6 + j * 2 + 1] * y[0 * 6 + i * 2 + 0];
+      gauge[j * 6 + i * 2 + 1] -= x[0 * 6 + j * 2 + 0] * y[0 * 6 + i * 2 + 1];
+      // trace over spin (manual unroll for perf)
+      // out(j, i).real(a(0, j).real() * b(0, i).real());
+      // out(j, i).real(out(j, i).real() + a(0, j).imag() * b(0, i).imag());
+      // out(j, i).imag(a(0, j).imag() * b(0, i).real());
+      // out(j, i).imag(out(j, i).imag() - a(0, j).real() * b(0, i).imag());
+
+      for (int s = 1; s < 4; s++) {
+        gauge[j * 6 + i * 2 + 0] += x[s * 6 + j * 2 + 0] * y[s * 6 + i * 2 + 0];
+        gauge[j * 6 + i * 2 + 0] += x[s * 6 + j * 2 + 1] * y[s * 6 + i * 2 + 1];
+        gauge[j * 6 + i * 2 + 1] += x[s * 6 + j * 2 + 1] * y[s * 6 + i * 2 + 0];
+        gauge[j * 6 + i * 2 + 1] -= x[s * 6 + j * 2 + 0] * y[s * 6 + i * 2 + 1];
+        //   out(j,i).real( out(j,i).real() + a(s,j).real() * b(s,i).real() );
+        //   out(j,i).real( out(j,i).real() + a(s,j).imag() * b(s,i).imag() );
+        //   out(j,i).imag( out(j,i).imag() + a(s,j).imag() * b(s,i).real() );
+        //   out(j,i).imag( out(j,i).imag() - a(s,j).real() * b(s,i).imag() );
+      }
+    }
+  }
+}
+
+template <typename gFloat> void accum_su3xsu3(gFloat *mom, gFloat *gauge, gFloat *oprod, double coeff)
+{
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 3; k++) {
+        mom[j * 6 + i * 2 + 0] += coeff * gauge[j * 6 + k * 2 + 0] * oprod[k * 6 + i * 2 + 0];
+        mom[j * 6 + i * 2 + 0] -= coeff * gauge[j * 6 + k * 2 + 1] * oprod[k * 6 + i * 2 + 1];
+        mom[j * 6 + i * 2 + 1] += coeff * gauge[j * 6 + k * 2 + 1] * oprod[k * 6 + i * 2 + 0];
+        mom[j * 6 + i * 2 + 1] += coeff * gauge[j * 6 + k * 2 + 0] * oprod[k * 6 + i * 2 + 1];
+      }
+    }
+  }
+}
+
+template <typename gFloat> void mult_su3xsu3(gFloat *mom, gFloat *gauge, gFloat *oprod, double coeff)
+{
+  for (size_t i = 0; i < gauge_site_size; i++) mom[i] = 0;
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 3; k++) {
+        mom[j * 6 + i * 2 + 0] += coeff * gauge[j * 6 + k * 2 + 0] * oprod[k * 6 + i * 2 + 0];
+        mom[j * 6 + i * 2 + 0] -= coeff * gauge[j * 6 + k * 2 + 1] * oprod[k * 6 + i * 2 + 1];
+        mom[j * 6 + i * 2 + 1] += coeff * gauge[j * 6 + k * 2 + 1] * oprod[k * 6 + i * 2 + 0];
+        mom[j * 6 + i * 2 + 1] += coeff * gauge[j * 6 + k * 2 + 0] * oprod[k * 6 + i * 2 + 1];
+      }
+    }
+  }
+}
+
+template <typename gFloat> void mult_su3xsu3dag(gFloat *mom, gFloat *gauge, gFloat *oprod, double coeff)
+{
+  for (size_t i = 0; i < gauge_site_size; i++) mom[i] = 0;
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 3; k++) {
+        mom[j * 6 + i * 2 + 0] += coeff * gauge[j * 6 + k * 2 + 0] * oprod[i * 6 + k * 2 + 0];
+        mom[j * 6 + i * 2 + 0] += coeff * gauge[j * 6 + k * 2 + 1] * oprod[i * 6 + k * 2 + 1];
+        mom[j * 6 + i * 2 + 1] += coeff * gauge[j * 6 + k * 2 + 1] * oprod[i * 6 + k * 2 + 0];
+        mom[j * 6 + i * 2 + 1] -= coeff * gauge[j * 6 + k * 2 + 0] * oprod[i * 6 + k * 2 + 1];
+      }
+    }
+  }
+}
+template <typename gFloat> void mult_dagsu3xsu3(gFloat *mom, gFloat *gauge, gFloat *oprod, double coeff)
+{
+  for (size_t i = 0; i < gauge_site_size; i++) mom[i] = 0;
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 3; k++) {
+        mom[j * 6 + i * 2 + 0] += coeff * gauge[k * 6 + j * 2 + 0] * oprod[k * 6 + i * 2 + 0];
+        mom[j * 6 + i * 2 + 0] += coeff * gauge[k * 6 + j * 2 + 1] * oprod[k * 6 + i * 2 + 1];
+        mom[j * 6 + i * 2 + 1] -= coeff * gauge[k * 6 + j * 2 + 1] * oprod[k * 6 + i * 2 + 0];
+        mom[j * 6 + i * 2 + 1] += coeff * gauge[k * 6 + j * 2 + 0] * oprod[k * 6 + i * 2 + 1];
+      }
+    }
+  }
+}
+
+template <typename gFloat> void accum_su3_to_anti_hermitian(gFloat *mom, gFloat *gauge, int sign = 1)
+{
+  auto temp = (gauge[0 * 6 + 0 * 2 + 1] + gauge[1 * 6 + 1 * 2 + 1] + gauge[2 * 6 + 2 * 2 + 1]) * 0.33333333333333333;
+  mom[6] += sign * (gauge[0 * 6 + 0 * 2 + 1] - temp);
+  mom[7] += sign * (gauge[1 * 6 + 1 * 2 + 1] - temp);
+  mom[8] += sign * (gauge[2 * 6 + 2 * 2 + 1] - temp);
+  // off-diagonal elements
+  mom[0] += sign * (gauge[0 * 6 + 1 * 2 + 0] - gauge[1 * 6 + 0 * 2 + 0]) * 0.5;
+  mom[1] += sign * (gauge[0 * 6 + 1 * 2 + 1] + gauge[1 * 6 + 0 * 2 + 1]) * 0.5;
+  mom[2] += sign * (gauge[0 * 6 + 2 * 2 + 0] - gauge[2 * 6 + 0 * 2 + 0]) * 0.5;
+  mom[3] += sign * (gauge[0 * 6 + 2 * 2 + 1] + gauge[2 * 6 + 0 * 2 + 1]) * 0.5;
+  mom[4] += sign * (gauge[1 * 6 + 2 * 2 + 0] - gauge[2 * 6 + 1 * 2 + 0]) * 0.5;
+  mom[5] += sign * (gauge[1 * 6 + 2 * 2 + 1] + gauge[2 * 6 + 1 * 2 + 1]) * 0.5;
+}
+// a = b - b^dag
+template <typename gFloat> void su3_imagx2(gFloat *a, gFloat *b)
+{
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 3; j++) {
+      a[j * 6 + i * 2 + 0] = b[j * 6 + i * 2 + 0] - b[i * 6 + j * 2 + 0];
+      a[j * 6 + i * 2 + 1] = b[j * 6 + i * 2 + 1] + b[i * 6 + j * 2 + 1];
+    }
+  }
+}
+
+template <typename sFloat, typename gFloat>
+void CloverForce_kernel_host(std::array<void *, 4> gauge, void *h_mom, quda::ColorSpinorField &inA,
+                             quda::ColorSpinorField &inB, int projSign, int parity, double force_coeff)
+{
+  gFloat **gaugeFull = (gFloat **)gauge.data();
+  sFloat *spinorField = (sFloat *)inB.data();
+
+  gFloat *gaugeEven[4], *gaugeOdd[4];
+
+  sFloat *A = (sFloat *)inA.data();
+
+  for (int dir = 0; dir < 4; dir++) {
+    gaugeEven[dir] = gaugeFull[dir];
+    gaugeOdd[dir] = gaugeFull[dir] + Vh * gauge_site_size;
+  }
+
+#pragma omp parallel for
+  for (int i = 0; i < Vh; i++) {
+    // loop over the forward directions
+    for (int dir = 0; dir < 8; dir += 2) {
+      // load the gauge
+      gFloat **gaugeField = (parity ? gaugeOdd : gaugeEven);
+      gFloat *gauge = &gaugeField[dir / 2][i * (3 * 3 * 2)];
+      // load shifted spinor and project
+#ifndef MULTI_GPU
+      const sFloat *spinor = spinorNeighbor(i, dir, parity, spinorField, 1);
+#else
+      sFloat **backSpinor = (sFloat **)inB.backGhostFaceBuffer;
+      sFloat **fwdSpinor = (sFloat **)inB.fwdGhostFaceBuffer;
+      const sFloat *spinor = spinorNeighbor_mg4dir(i, dir, parity, spinorField, fwdSpinor, backSpinor, 1, 1);
+#endif
+      sFloat projectedSpinor[spinor_site_size];
+      int projIdx = 2 * (dir / 2) + (projSign + 1) / 2; //+ (dir + daggerBit) % 2;
+      multiplySpinorByDiracProjector(projectedSpinor, projIdx, spinor);
+
+      gFloat oprod[gauge_site_size];
+      outerProdSpinTrace(oprod, projectedSpinor, &A[i * spinor_site_size]);
+
+      gFloat force[gauge_site_size];
+      for (size_t j = 0; j < gauge_site_size; j++) force[j] = 0;
+      accum_su3xsu3(force, gauge, oprod, force_coeff);
+      int mu = (dir / 2);
+      gFloat *mom = (gFloat *)h_mom + (4 * (i + Vh * parity) + mu) * mom_site_size;
+      accum_su3_to_anti_hermitian(mom, force);
+    }
+  }
+}
+
+/**
+   @brief Host reference driver for the hopping-term part of the clover force.  For every vector pair
+   (x[k], p[k]) and both parities, CloverForce_kernel_host is run twice: once with x on this parity and p
+   on the opposite parity (projector sign +1), and once with p on this parity and x on the opposite
+   parity (projector sign -1).
+   @param[in,out] h_mom Host momentum field that the kernel accumulates into
+   @param[in] gauge Per-direction host gauge field pointers
+   @param[in] x Full-parity quark fields
+   @param[in] p Full-parity auxiliary fields paired with x
+   @param[in] force_coeff One coefficient per vector pair
+*/
+void CloverForce_reference(void *h_mom, std::array<void *, 4> gauge, std::vector<quda::ColorSpinorField> &x,
+                           std::vector<quda::ColorSpinorField> &p, std::vector<double> force_coeff)
+{
+  constexpr int n_face = 1;
+  constexpr int dagger = 1;
+
+  for (size_t k = 0; k < x.size(); ++k) {
+    for (int parity = 0; parity < 2; ++parity) {
+      const bool odd = (parity & 1) != 0;
+      const auto other_parity = (QudaParity)(1 - parity);
+
+      quda::ColorSpinorField &x_here = odd ? x[k].Odd() : x[k].Even();
+      quda::ColorSpinorField &p_there = odd ? p[k].Even() : p[k].Odd();
+      quda::ColorSpinorField &p_here = odd ? p[k].Odd() : p[k].Even();
+      quda::ColorSpinorField &x_there = odd ? x[k].Even() : x[k].Odd();
+
+      // Each exchangeGhost call replaces the ghost face buffers with those of the field it was called on,
+      // so every exchange has to come immediately before the kernel that consumes it.
+      p_there.exchangeGhost(other_parity, n_face, dagger);
+      CloverForce_kernel_host<double, double>(gauge, h_mom, x_here, p_there, 1, parity, force_coeff[k]);
+
+      x_there.exchangeGhost(other_parity, n_face, 1 - dagger);
+      CloverForce_kernel_host<double, double>(gauge, h_mom, p_here, x_there, -1, parity, force_coeff[k]);
+    }
+  }
+}
+/**
+   @brief Host reference for the clover sigma-trace term.  For every checkerboard site of the given parity
+   the two 6x6 Hermitian chiral blocks A of the clover field are unpacked, replaced by
+   0.5 * A * (A * A + (mu2 - eps2))^{-1} (times 0.25 if twist is set), and coeff times the resulting 3x3
+   color matrix is accumulated into oprod for each of the six planes mu > nu.
+   @param[in,out] oprod Tensor field accumulated into, indexed as
+   2 * (i + Vh * (color + 9 * (munu + parity * 6))) + reim
+   @param[in] clover Packed clover field, chiralBlock (= 36) reals per site and chirality
+   @param[in] coeff Scale applied to each 3x3 matrix before it is accumulated
+   @param[in] parity Parity of the sites processed; selects the offsets into clover and oprod
+   @param[in] mu2 Added to the diagonal of A * A
+   @param[in] eps2 Subtracted from the diagonal of A * A
+   @param[in] twist If true, the result is scaled by an additional factor 0.25
+*/
+template <typename cFloat>
+void cloverSigmaTraceCompute_host(cFloat *oprod, cFloat *clover, double coeff, int parity, double mu2, double eps2,
+                                  bool twist)
+{
+  int nSpin = 4;
+  int nColor = 3;
+  int N = nColor * nSpin / 2;
+  int chiralBlock = N + 2 * (N - 1) * N / 2;
+
+  typedef Eigen::Matrix<std::complex<cFloat>, 3, 3> Matrix3c;
+  typedef Eigen::Matrix<std::complex<cFloat>, 6, 6> CloverM;
+
+#pragma omp parallel for
+  for (int i = 0; i < Vh; i++) {
+    // packed result for both chiralities of this site (2 x 36 reals)
+    cFloat A_array[72];
+    for (int chirality = 0; chirality < 2; chirality++) {
+      // the clover field for a given chirality is stored as
+      // N real numbers: the diagonal part D
+      // Then the off diagonal part is stored in the complex array L
+      // (----       L[0]       L[1]        L[2]         L[3]         L[4]       )
+      // (           ----       L[5=N-1]    L[6]         L[7]         L[8]       )
+      // (                      ----        L[9=2N-3]    L[10]        L[11]      )
+      // (                                  ----         L[12=3N-6]   L[13]      )
+      // (                                               ----         L[14=4N-10])
+      // (                                                            ----       )
+
+      // NOTE(review): all 36 entries copied here are written again by the repacking loop at the end of
+      // this chirality iteration, so this copy looks redundant -- confirm before removing
+      for (int j = 0; j < 36; j++)
+        A_array[chirality * 36 + j] = clover[((parity * Vh + i) * 2 + chirality) * chiralBlock + j];
+
+      // unpack the chiral block into a dense 6x6 matrix
+      CloverM A;
+      int index = ((parity * Vh + i) * 2 + chirality) * chiralBlock;
+      // diag
+      for (int j = 0; j < 6; j++) {
+        A(j, j).real(clover[index + j]);
+        A(j, j).imag(0);
+      }
+      // off-diag
+      for (int row = 0; row < 6; row++) {
+        for (int col = (row + 1); col < 6; col++) {
+          // row-major position of (row, col) within the strict upper triangle
+          int id = N * row - (row * (row + 1)) / 2 + (col - row - 1);
+          A(row, col).real(clover[index + 6 + id * 2]);
+          A(row, col).imag(clover[index + 6 + id * 2 + 1]);
+        }
+      }
+
+      // fill the lower triangle by Hermitian conjugation
+      for (int row = 0; row < 6; row++) {
+        for (int j = 0; j < row; j++) { A(row, j) = conj(A(j, row)); }
+      }
+      // A <- 0.5 * A * (A^2 + mu2 - eps2)^{-1}
+      CloverM B = A * A;
+      for (int j = 0; j < 6; j++) B(j, j) = B(j, j) + mu2 - eps2;
+
+      B = 0.5 * B.inverse();
+      A = A * B;
+      if (twist) { A = 0.25 * A; }
+
+      // repack diagonal and strict upper triangle in the same layout as the input
+      for (int row = 0; row < 6; row++) {
+        A_array[chirality * 36 + row] = A(row, row).real();
+        for (int col = (row + 1); col < 6; col++) {
+          int id = N * row - (row * (row + 1)) / 2 + (col - row - 1);
+          A_array[chirality * 36 + 6 + id * 2] = A(row, col).real();
+          A_array[chirality * 36 + 6 + id * 2 + 1] = A(row, col).imag();
+        }
+      }
+    }
+    for (int mu = 0; mu < 4; mu++) {
+      for (int nu = 0; nu < mu; nu++) {
+        // oprod is stored only for nu<mu (6 indices) as
+        // (---                              )
+        // (oprod[0]  ---                    )
+        // (oprod[1]  oprod[2]  ---          )
+        // (oprod[3]  oprod[4]  oprod[5]  ---)
+        // the full lexicographic index of oprod is
+        // = reim + 2 * (x_eo / 2 + (V/2) * (color + 9 * (munu + parity * 6)))
+        // munu = (mu - 1) * mu / 2 + nu
+        // color = col_color+ row_color*Ncolor
+
+        Matrix3c mat = Matrix3c::Zero();
+        cFloat diag[2][6];
+        std::complex<cFloat> tri[2][15];
+        // idtab[k] = col * (col - 1) / 2 + row for the k-th row-major strict-upper-triangle entry (row, col)
+        const int idtab[15] = {0, 1, 3, 6, 10, 2, 4, 7, 11, 5, 8, 12, 9, 13, 14};
+        std::complex<cFloat> ctmp;
+
+        // note: the loop index i below shadows the site index i only inside these two loops
+        for (int ch = 0; ch < 2; ++ch) {
+          // factor of two is inherent to QUDA clover storage
+          for (int i = 0; i < 6; i++) diag[ch][i] = 2.0 * A_array[ch * 36 + i];
+          for (int i = 0; i < 15; i++)
+            tri[ch][idtab[i]]
+              = std::complex<cFloat>(2.0 * A_array[ch * 36 + 6 + 2 * i], 2.0 * A_array[ch * 36 + 6 + 2 * i + 1]);
+        }
+
+        // X, Y
+        if (nu == 0) {
+          if (mu == 1) {
+            for (int j = 0; j < 3; ++j) { mat(j, j).imag(diag[0][j + 3] + diag[1][j + 3] - diag[0][j] - diag[1][j]); }
+
+            // triangular part
+            int jk = 0;
+            for (int j = 1; j < 3; ++j) {
+              int jk2 = (j + 3) * (j + 2) / 2 + 3;
+              for (int k = 0; k < j; ++k) {
+                ctmp = tri[0][jk2] + tri[1][jk2] - tri[0][jk] - tri[1][jk];
+
+                mat(j, k).real(-ctmp.imag());
+                mat(j, k).imag(ctmp.real());
+                mat(k, j).real(ctmp.imag());
+                mat(k, j).imag(ctmp.real());
+
+                jk++;
+                jk2++;
+              }
+            } // X Y
+
+          } else if (mu == 2) {
+
+            for (int j = 0; j < 3; ++j) {
+              int jk = (j + 3) * (j + 2) / 2;
+              for (int k = 0; k < 3; ++k) {
+                int kj = (k + 3) * (k + 2) / 2 + j;
+                mat(j, k) = conj(tri[0][kj]) - tri[0][jk] + conj(tri[1][kj]) - tri[1][jk];
+                jk++;
+              }
+            } // X Z
+
+          } else if (mu == 3) {
+            for (int j = 0; j < 3; ++j) {
+              int jk = (j + 3) * (j + 2) / 2;
+              for (int k = 0; k < 3; ++k) {
+                int kj = (k + 3) * (k + 2) / 2 + j;
+                ctmp = conj(tri[0][kj]) + tri[0][jk] - conj(tri[1][kj]) - tri[1][jk];
+                mat(j, k).real(-ctmp.imag());
+                mat(j, k).imag(ctmp.real());
+                jk++;
+              }
+            }
+          } // mu == 3 // X T
+        } else if (nu == 1) {
+          if (mu == 2) { // Y Z
+            for (int j = 0; j < 3; ++j) {
+              int jk = (j + 3) * (j + 2) / 2;
+              for (int k = 0; k < 3; ++k) {
+                int kj = (k + 3) * (k + 2) / 2 + j;
+                ctmp = conj(tri[0][kj]) + tri[0][jk] + conj(tri[1][kj]) + tri[1][jk];
+                mat(j, k).real(ctmp.imag());
+                mat(j, k).imag(-ctmp.real());
+                jk++;
+              }
+            }
+          } else if (mu == 3) { // Y T
+            for (int j = 0; j < 3; ++j) {
+              int jk = (j + 3) * (j + 2) / 2;
+              for (int k = 0; k < 3; ++k) {
+                int kj = (k + 3) * (k + 2) / 2 + j;
+                mat(j, k) = conj(tri[0][kj]) - tri[0][jk] - conj(tri[1][kj]) + tri[1][jk];
+                jk++;
+              }
+            }
+          } // mu == 3
+        }   // nu == 1
+        else if (nu == 2) {
+          if (mu == 3) { // Z T
+            for (int j = 0; j < 3; ++j) { mat(j, j).imag(diag[0][j] - diag[0][j + 3] - diag[1][j] + diag[1][j + 3]); }
+            int jk = 0;
+            for (int j = 1; j < 3; ++j) {
+              int jk2 = (j + 3) * (j + 2) / 2 + 3;
+              for (int k = 0; k < j; ++k) {
+                ctmp = tri[0][jk] - tri[0][jk2] - tri[1][jk] + tri[1][jk2];
+                mat(j, k).real(-ctmp.imag());
+                mat(j, k).imag(ctmp.real());
+
+                mat(k, j).real(ctmp.imag());
+                mat(k, j).imag(ctmp.real());
+                jk++;
+                jk2++;
+              }
+            }
+          }
+        }
+
+        mat *= coeff;
+        // arg.output((mu-1)*mu/2 + nu, x, arg.parity) = mat;
+
+        // accumulate (not overwrite) into the tensor field at this site and plane
+        int munu = (mu - 1) * mu / 2 + nu;
+        for (int ci = 0; ci < nColor; ci++) {   // row
+          for (int cj = 0; cj < nColor; cj++) { // col
+            int color = ci * nColor + cj;
+            int id = 2 * (i + Vh * (color + 9 * (munu + parity * 6)));
+            oprod[id + 0] += mat(ci, cj).real();
+            oprod[id + 1] += mat(ci, cj).imag();
+          }
+        }
+
+      } // nu
+    }   // mu
+  }
+}
+
+/**
+   @brief Type-erased entry point for the host clover sigma-trace reference.  Forwards all arguments to
+   cloverSigmaTraceCompute_host, interpreting both buffers as double precision.
+   @param[in,out] oprod Tensor field accumulated into (treated as double *)
+   @param[in] clover Packed clover field (treated as double *)
+   @param[in] coeff Scale forwarded to the kernel
+   @param[in] parity Parity forwarded to the kernel
+   @param[in] mu2 Forwarded to the kernel
+   @param[in] eps2 Forwarded to the kernel
+   @param[in] twist Forwarded to the kernel
+*/
+void computeCloverSigmaTrace_reference(void *oprod, void *clover, double coeff, int parity, double mu2, double eps2,
+                                       bool twist)
+{
+  // FIXME: here call the appropriate template function according to gauge_precision
+  auto *oprod_d = static_cast<double *>(oprod);
+  auto *clover_d = static_cast<double *>(clover);
+  cloverSigmaTraceCompute_host(oprod_d, clover_d, coeff, parity, mu2, eps2, twist);
+}
+
+/**
+   @brief Gather one 3x3 complex matrix from the extended tensor field into a contiguous buffer of 18
+   reals (row-major, real part followed by imaginary part).
+   @param[out] oprod_out Destination buffer; entry (row, col) is written at 2 * (3 * row + col)
+   @param[in] oprod Extended tensor field, indexed as
+   2 * (x_cb + (volume_ex / 2) * (color + 9 * (munu + parity * 6))) + reim
+   @param[in] munu Plane index of the requested matrix
+   @param[in] nbr_idx Full-lattice site index; split into checkerboard index and parity bit using volume_ex / 2
+   @param[in] lat Extended-lattice descriptor providing volume_ex
+*/
+template <typename gFloat>
+void get_su3FromOprod(gFloat *oprod_out, gFloat *oprod, int munu, size_t nbr_idx, const lattice_t &lat)
+{
+  const auto half_volume_ex = lat.volume_ex / 2;
+  const int site_cb = nbr_idx % half_volume_ex;
+  const int odd_bit = nbr_idx / half_volume_ex;
+
+  // color = col + 3 * row, which is also the position of the entry in the output buffer
+  for (int color = 0; color < 9; color++) {
+    const int src = 2 * (site_cb + half_volume_ex * (color + 9 * (munu + odd_bit * 6)));
+    oprod_out[2 * color + 0] = oprod[src + 0];
+    oprod_out[2 * color + 1] = oprod[src + 1];
+  }
+}
+
+/**
+   @brief Host reference for a single (site, mu, nu) contribution of the clover derivative.  Four links
+   around the mu-nu plaquette on the +nu side and on the -nu side of the starting site are multiplied
+   together with a tensor-field matrix inserted at one corner, and each product is accumulated into the
+   momentum of link mu at the starting site (negated for nu < mu on the +nu side, negated for nu > mu on
+   the -nu side).
+   @tparam gFloat Host floating-point type of the gauge, tensor and momentum data
+   @param[in,out] h_mom_ Momentum field; the entry updated is (4 * (i + Vh * acc_parity) + mu) * mom_site_size
+   @param[in] gauge_ex Per-direction pointers to the extended gauge field (18 reals per link)
+   @param[in] lat Extended-lattice descriptor used for all neighbor lookups
+   @param[in] oprod_ Extended tensor field read through get_su3FromOprod
+   @param[in] i Checkerboard site index
+   @param[in] yIndex 0: start from the site of parity `parity` and insert the tensor field at the start
+   site and at the diagonally opposite corners; otherwise start from the site of the opposite parity and
+   insert it at the corners adjacent to the start site
+   @param[in] parity Parity argument (see yIndex)
+   @param[in] mu Direction of the link whose momentum is updated
+   @param[in] nu Second direction spanning the plaquette (callers skip nu == mu)
+*/
+template <typename gFloat>
+void computeForce_reference(void *h_mom_, void **gauge_ex, lattice_t lat, void *oprod_, int i, int yIndex, int parity,
+                            int mu, int nu)
+{
+  gFloat *oprod = (gFloat *)oprod_;
+
+  // the momentum that is updated lives on the starting site of the selected branch
+  int acc_parity = yIndex == 0 ? parity : 1 - parity;
+  gFloat *mom = (gFloat *)h_mom_ + (4 * (i + Vh * acc_parity) + mu) * mom_site_size;
+
+  gFloat **gaugeFull_ex = (gFloat **)gauge_ex;
+
+  int otherparity = (1 - parity);
+  // plane index of the unordered pair (mu, nu); the tensor field is stored only for the larger index first
+  const int tidx = mu > nu ? (mu - 1) * mu / 2 + nu : (nu - 1) * nu / 2 + mu;
+  gFloat su3tmp1[gauge_site_size], su3tmp2[gauge_site_size];
+
+  if (yIndex == 0) { // do "this" force
+
+    // plaquette on the +nu side of the link
+    // U[mu](x) U[nu](x+mu) U[*mu](x+nu) U[*nu](x) Oprod(x)
+    {
+      int d[4] = {0, 0, 0, 0};
+      int nbr_idx;
+      int eo_full_id = i + parity * Vh;
+      // load U(x)_(+mu)
+      // Link U1 = arg.gauge(mu, linkIndexShift(x, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U1 = gaugeFull_ex[mu] + nbr_idx * (3 * 3 * 2);
+
+      // load U(x+mu)_(+nu)
+      d[mu]++;
+      // Link U2 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U2 = gaugeFull_ex[nu] + nbr_idx * (3 * 3 * 2);
+      d[mu]--;
+
+      // load U(x+nu)_(+mu)
+      d[nu]++;
+      // Link U3 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U3 = gaugeFull_ex[mu] + nbr_idx * (3 * 3 * 2);
+      d[nu]--;
+
+      // load U(x)_(+nu)
+      // Link U4 = arg.gauge(nu, linkIndexShift(x, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U4 = gaugeFull_ex[nu] + nbr_idx * (3 * 3 * 2);
+
+      // load Oprod
+      // Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat Oprod1[gauge_site_size];
+      get_su3FromOprod(Oprod1, oprod, tidx, nbr_idx, lat);
+
+      // if (nu < mu)
+      //   force -= U1 * U2 * conj(U3) * conj(U4) * Oprod1;
+      // else
+      //   force += U1 * U2 * conj(U3) * conj(U4) * Oprod1;
+      mult_dagsu3xsu3(su3tmp1, U4, Oprod1, 1);
+      mult_dagsu3xsu3(su3tmp2, U3, su3tmp1, 1);
+      mult_su3xsu3(su3tmp1, U2, su3tmp2, 1);
+      mult_su3xsu3(su3tmp2, U1, su3tmp1, 1);
+      if (nu < mu)
+        accum_su3_to_anti_hermitian(mom, su3tmp2, -1);
+      else
+        accum_su3_to_anti_hermitian(mom, su3tmp2);
+
+      // load Oprod(x+mu+nu)
+      d[mu]++;
+      d[nu]++;
+      // Link Oprod2 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat Oprod2[gauge_site_size];
+      get_su3FromOprod(Oprod2, oprod, tidx, nbr_idx, lat);
+      d[mu]--;
+      d[nu]--;
+
+      // if (nu < mu)
+      //   force -= U1 * U2 * Oprod2 * conj(U3) * conj(U4);
+      // else
+      //   force += U1 * U2 * Oprod2 * conj(U3) * conj(U4);
+      mult_su3xsu3(su3tmp1, U4, U3, 1);
+      mult_su3xsu3dag(su3tmp2, Oprod2, su3tmp1, 1);
+      mult_su3xsu3(su3tmp1, U2, su3tmp2, 1);
+      mult_su3xsu3(su3tmp2, U1, su3tmp1, 1);
+      if (nu < mu)
+        accum_su3_to_anti_hermitian(mom, su3tmp2, -1);
+      else
+        accum_su3_to_anti_hermitian(mom, su3tmp2);
+    }
+
+    // plaquette on the -nu side of the link
+    {
+      int d[4] = {0, 0, 0, 0};
+      int nbr_idx;
+      int eo_full_id = i + parity * Vh;
+
+      // load U(x-nu)(+nu)
+      d[nu]--;
+      // Link U1 = arg.gauge(nu, linkIndexShift(x, d, arg.E), otherparity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U1 = gaugeFull_ex[nu] + nbr_idx * (3 * 3 * 2);
+      d[nu]++;
+
+      // load U(x-nu)(+mu)
+      d[nu]--;
+      // Link U2 = arg.gauge(mu, linkIndexShift(x, d, arg.E), otherparity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U2 = gaugeFull_ex[mu] + nbr_idx * (3 * 3 * 2);
+      d[nu]++;
+
+      // load U(x+mu-nu)(nu)
+      d[mu]++;
+      d[nu]--;
+      // Link U3 = arg.gauge(nu, linkIndexShift(x, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U3 = gaugeFull_ex[nu] + nbr_idx * (3 * 3 * 2);
+      d[mu]--;
+      d[nu]++;
+
+      // load U(x)_(+mu)
+      // Link U4 = arg.gauge(mu, linkIndexShift(x, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U4 = gaugeFull_ex[mu] + nbr_idx * (3 * 3 * 2);
+
+      // load Oprod(x+mu-nu)
+      d[mu]++;
+      d[nu]--;
+      // Link Oprod1 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat Oprod1[gauge_site_size];
+      get_su3FromOprod(Oprod1, oprod, tidx, nbr_idx, lat);
+      d[mu]--;
+      d[nu]++;
+
+      // if (nu < mu)
+      //   force += conj(U1) * U2 * Oprod1 * U3 * conj(U4);
+      // else
+      //   force -= conj(U1) * U2 * Oprod1 * U3 * conj(U4);
+      mult_su3xsu3dag(su3tmp1, U3, U4, 1);
+      mult_su3xsu3(su3tmp2, Oprod1, su3tmp1, 1);
+      mult_su3xsu3(su3tmp1, U2, su3tmp2, 1);
+      mult_dagsu3xsu3(su3tmp2, U1, su3tmp1, 1);
+      if (nu < mu)
+        accum_su3_to_anti_hermitian(mom, su3tmp2);
+      else
+        accum_su3_to_anti_hermitian(mom, su3tmp2, -1);
+
+      // load Oprod(x)
+      // Link Oprod4 = arg.oprod(tidx, linkIndexShift(x, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat Oprod4[gauge_site_size];
+      get_su3FromOprod(Oprod4, oprod, tidx, nbr_idx, lat);
+
+      // if (nu < mu)
+      //   force += Oprod4 * conj(U1) * U2 * U3 * conj(U4);
+      // else
+      //   force -= Oprod4 * conj(U1) * U2 * U3 * conj(U4);
+      mult_su3xsu3dag(su3tmp1, U3, U4, 1);
+      mult_su3xsu3(su3tmp2, U2, su3tmp1, 1);
+      mult_dagsu3xsu3(su3tmp1, U1, su3tmp2, 1);
+      mult_su3xsu3(su3tmp2, Oprod4, su3tmp1, 1);
+      if (nu < mu)
+        accum_su3_to_anti_hermitian(mom, su3tmp2);
+      else
+        accum_su3_to_anti_hermitian(mom, su3tmp2, -1);
+    }
+
+  } else { // else do other force
+
+    // plaquette on the +nu side of the link, starting from the opposite-parity site
+    {
+      int d[4] = {0, 0, 0, 0};
+      int nbr_idx;
+      int eo_full_id = i + otherparity * Vh;
+      // load U(x)_(+mu)
+      // Link U1 = arg.gauge(mu, linkIndexShift(y, d, arg.E), otherparity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U1 = gaugeFull_ex[mu] + nbr_idx * (3 * 3 * 2);
+
+      // load U(x+mu)_(+nu)
+      d[mu]++;
+      // Link U2 = arg.gauge(nu, linkIndexShift(y, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U2 = gaugeFull_ex[nu] + nbr_idx * (3 * 3 * 2);
+      d[mu]--;
+
+      // load U(x+nu)_(+mu)
+      d[nu]++;
+      // Link U3 = arg.gauge(mu, linkIndexShift(y, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U3 = gaugeFull_ex[mu] + nbr_idx * (3 * 3 * 2);
+      d[nu]--;
+
+      // load U(x)_(+nu)
+      // Link U4 = arg.gauge(nu, linkIndexShift(y, d, arg.E), otherparity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U4 = gaugeFull_ex[nu] + nbr_idx * (3 * 3 * 2);
+      // load opposite parity Oprod, Oprod(x+nu)
+      d[nu]++;
+      // Link Oprod3 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat Oprod3[gauge_site_size];
+      get_su3FromOprod(Oprod3, oprod, tidx, nbr_idx, lat);
+      d[nu]--;
+
+      // if (nu < mu)
+      //   force -= U1 * U2 * conj(U3) * Oprod3 * conj(U4);
+      // else
+      //   force += U1 * U2 * conj(U3) * Oprod3 * conj(U4);
+      mult_su3xsu3dag(su3tmp1, Oprod3, U4, 1);
+      mult_dagsu3xsu3(su3tmp2, U3, su3tmp1, 1);
+      mult_su3xsu3(su3tmp1, U2, su3tmp2, 1);
+      mult_su3xsu3(su3tmp2, U1, su3tmp1, 1);
+      if (nu < mu)
+        accum_su3_to_anti_hermitian(mom, su3tmp2, -1);
+      else
+        accum_su3_to_anti_hermitian(mom, su3tmp2);
+
+      // load Oprod(x+mu)
+      d[mu]++;
+      // Link Oprod4 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat Oprod4[gauge_site_size];
+      get_su3FromOprod(Oprod4, oprod, tidx, nbr_idx, lat);
+      d[mu]--;
+
+      // if (nu < mu)
+      //   force -= U1 * Oprod4 * U2 * conj(U3) * conj(U4);
+      // else
+      //   force += U1 * Oprod4 * U2 * conj(U3) * conj(U4);
+      // below we implemented force +-=U1 * Oprod4 * U2 * conj( U4 * U3);
+      mult_su3xsu3(su3tmp1, U4, U3, 1);
+      mult_su3xsu3dag(su3tmp2, U2, su3tmp1, 1);
+      mult_su3xsu3(su3tmp1, Oprod4, su3tmp2, 1);
+      mult_su3xsu3(su3tmp2, U1, su3tmp1, 1);
+      if (nu < mu)
+        accum_su3_to_anti_hermitian(mom, su3tmp2, -1);
+      else
+        accum_su3_to_anti_hermitian(mom, su3tmp2);
+    }
+
+    // plaquette on the -nu side of the link, starting from the opposite-parity site
+    {
+      int d[4] = {0, 0, 0, 0};
+      int nbr_idx;
+      int eo_full_id = i + otherparity * Vh;
+
+      // load U(x-nu)(+nu)
+      d[nu]--;
+      // Link U1 = arg.gauge(nu, linkIndexShift(y, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U1 = gaugeFull_ex[nu] + nbr_idx * (3 * 3 * 2);
+      d[nu]++;
+
+      // load U(x-nu)(+mu)
+      d[nu]--;
+      // Link U2 = arg.gauge(mu, linkIndexShift(y, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U2 = gaugeFull_ex[mu] + nbr_idx * (3 * 3 * 2);
+      d[nu]++;
+
+      // load U(x+mu-nu)(nu)
+      d[mu]++;
+      d[nu]--;
+      // Link U3 = arg.gauge(nu, linkIndexShift(y, d, arg.E), otherparity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U3 = gaugeFull_ex[nu] + nbr_idx * (3 * 3 * 2);
+      d[mu]--;
+      d[nu]++;
+
+      // load U(x)_(+mu)
+      // Link U4 = arg.gauge(mu, linkIndexShift(y, d, arg.E), otherparity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat *U4 = gaugeFull_ex[mu] + nbr_idx * (3 * 3 * 2);
+
+      // load Oprod(x+mu)
+      d[mu]++;
+      // Link Oprod1 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat Oprod1[gauge_site_size];
+      get_su3FromOprod(Oprod1, oprod, tidx, nbr_idx, lat);
+      d[mu]--;
+
+      // if (nu < mu)
+      //   force += conj(U1) * U2 * U3 * Oprod1 * conj(U4);
+      // else
+      //   force -= conj(U1) * U2 * U3 * Oprod1 * conj(U4);
+      mult_su3xsu3dag(su3tmp1, Oprod1, U4, 1);
+      mult_su3xsu3(su3tmp2, U3, su3tmp1, 1);
+      mult_su3xsu3(su3tmp1, U2, su3tmp2, 1);
+      mult_dagsu3xsu3(su3tmp2, U1, su3tmp1, 1);
+      if (nu < mu)
+        accum_su3_to_anti_hermitian(mom, su3tmp2);
+      else
+        accum_su3_to_anti_hermitian(mom, su3tmp2, -1);
+
+      // load Oprod(x-nu)
+      d[nu]--;
+      // Link Oprod2 = arg.oprod(tidx, linkIndexShift(y, d, arg.E), arg.parity);
+      nbr_idx = gf_neighborIndexFullLattice(eo_full_id, d, lat);
+      gFloat Oprod2[gauge_site_size];
+      get_su3FromOprod(Oprod2, oprod, tidx, nbr_idx, lat);
+      d[nu]++;
+
+      // if (nu < mu)
+      //   force += conj(U1) * Oprod2 * U2 * U3 * conj(U4);
+      // else
+      //   force -= conj(U1) * Oprod2 * U2 * U3 * conj(U4);
+      mult_su3xsu3dag(su3tmp1, U3, U4, 1);
+      mult_su3xsu3(su3tmp2, U2, su3tmp1, 1);
+      mult_su3xsu3(su3tmp1, Oprod2, su3tmp2, 1);
+      mult_dagsu3xsu3(su3tmp2, U1, su3tmp1, 1);
+      if (nu < mu)
+        accum_su3_to_anti_hermitian(mom, su3tmp2);
+      else
+        accum_su3_to_anti_hermitian(mom, su3tmp2, -1);
+    }
+  }
+}
+
+/**
+   @brief Host reference for the clover derivative: extends the gauge field and the tensor (oprod) field
+   by a halo in every partitioned dimension and, for every checkerboard site, both yIndex values and all
+   ordered pairs mu != nu, calls computeForce_reference to accumulate into the momentum field.
+   @param[in,out] h_mom Host momentum field accumulated into
+   @param[in] gauge Per-direction host gauge field pointers (QDP order)
+   @param[in] oprod Host tensor field wrapped as a FLOAT2-ordered, tensor-geometry reference field
+   @param[in] parity Parity forwarded to computeForce_reference
+   @param[in] gauge_param Gauge parameters; cpu_prec selects the double or float instantiation
+*/
+void cloverDerivative_reference(void *h_mom, void **gauge, void *oprod, int parity, QudaGaugeParam &gauge_param)
+{
+  // Reject unsupported precisions once, before anything is allocated and outside the OpenMP region,
+  // instead of reporting the error from every thread inside the site loop.
+  const auto prec = gauge_param.cpu_prec;
+  if (prec != QUDA_DOUBLE_PRECISION && prec != QUDA_SINGLE_PRECISION) errorQuda("Unsupported precision %d", prec);
+  const bool use_double = (prec == QUDA_DOUBLE_PRECISION);
+
+  // halo width: two sites in every partitioned dimension, none otherwise
+  quda::lat_dim_t R;
+  for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d);
+
+  // extended copy of the gauge field
+  QudaGaugeParam param = newQudaGaugeParam();
+  setGaugeParam(param);
+  param.gauge_order = QUDA_QDP_GAUGE_ORDER;
+  param.t_boundary = QUDA_PERIODIC_T;
+
+  auto qdp_ex = quda::createExtendedGauge(gauge, param, R);
+  lattice_t lat(*qdp_ex);
+
+  // extended copy of the tensor field; the source buffer is referenced, not copied, by the wrapper field
+  quda::GaugeFieldParam gparam(gauge_param, oprod, QUDA_GENERAL_LINKS);
+  gparam.create = QUDA_REFERENCE_FIELD_CREATE;
+  gparam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  gparam.geometry = QUDA_TENSOR_GEOMETRY;
+  auto oprod_ex = quda::createExtendedGauge(quda::GaugeField(gparam), R);
+
+#pragma omp parallel for
+  for (int i = 0; i < Vh; i++) {
+    for (int yIndex = 0; yIndex < 2; yIndex++) {
+      for (int mu = 0; mu < 4; mu++) {
+        for (int nu = 0; nu < 4; nu++) {
+          if (nu == mu) continue;
+
+          if (use_double)
+            computeForce_reference<double>(h_mom, (void **)qdp_ex->raw_pointer(), lat, oprod_ex->data(), i, yIndex,
+                                           parity, mu, nu);
+          else
+            computeForce_reference<float>(h_mom, (void **)qdp_ex->raw_pointer(), lat, oprod_ex->data(), i, yIndex,
+                                          parity, mu, nu);
+        }
+      }
+    }
+  }
+
+  delete oprod_ex;
+  delete qdp_ex;
+}
+
+/**
+   @brief Host reference for the clover sigma outer product of one (p, x) pair.  For both parities, every
+   checkerboard site and every plane mu > nu, the spinor p is multiplied by the two gamma matrices in both
+   orders, the difference is outer-producted with x (outerProdSpinTrace), passed through su3_imagx2, and
+   coeff[parity] / 2 times the result is accumulated into the tensor field.
+   @tparam sFloat Host floating-point type of the spinor data
+   @tparam gFloat Host floating-point type of the tensor field
+   @param[in,out] oprod_ Tensor field accumulated into, indexed as
+   2 * (i + Vh * (color + 9 * (munu + parity * 6))) + reim
+   @param[in] inp Full-parity p field
+   @param[in] inx Full-parity x field
+   @param[in] coeff Two coefficients, one per parity
+*/
+template <typename sFloat, typename gFloat>
+void CloverSigmaOprod_reference(void *oprod_, quda::ColorSpinorField &inp, quda::ColorSpinorField &inx,
+                                std::vector<double> &coeff)
+{
+  constexpr int n_color = 3;
+  gFloat *tensor = (gFloat *)oprod_;
+  sFloat *x_data = (sFloat *)inx.data();
+  sFloat *p_data = (sFloat *)inp.data();
+
+  for (int parity = 0; parity < 2; parity++) {
+#pragma omp parallel for
+    for (int i = 0; i < Vh; i++) {
+      sFloat *p_site = &p_data[spinor_site_size * (i + Vh * parity)];
+      sFloat *x_site = &x_data[spinor_site_size * (i + Vh * parity)];
+
+      for (int mu = 1; mu < 4; mu++) {
+        for (int nu = 0; nu < mu; nu++) {
+          sFloat work[spinor_site_size], via_nu_then_mu[spinor_site_size], via_mu_then_nu[spinor_site_size];
+
+          // apply gamma index nu then mu ...
+          multiplySpinorByDiracgamma(work, nu, p_site);
+          multiplySpinorByDiracgamma(via_nu_then_mu, mu, work);
+
+          // ... and gamma index mu then nu
+          multiplySpinorByDiracgamma(work, mu, p_site);
+          multiplySpinorByDiracgamma(via_mu_then_nu, nu, work);
+
+          // difference of the two orderings over 4 spins x 3 colors x (re, im), stored back into work
+          for (int k = 0; k < 4 * 3 * 2; k++) work[k] = -via_nu_then_mu[k] + via_mu_then_nu[k];
+
+          gFloat outer[gauge_site_size];
+          gFloat outer_imx2[gauge_site_size];
+          outerProdSpinTrace(outer, work, x_site);
+          su3_imagx2(outer_imx2, outer);
+
+          const int munu = (mu - 1) * mu / 2 + nu;
+
+          // color = row * n_color + col
+          for (int color = 0; color < n_color * n_color; color++) {
+            const int id = 2 * (i + Vh * (color + 9 * (munu + parity * 6)));
+            tensor[id + 0] += coeff[parity] * outer_imx2[color * 2 + 0] / 2.0;
+            tensor[id + 1] += coeff[parity] * outer_imx2[color * 2 + 1] / 2.0;
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+   @brief Accumulate the clover sigma outer product of every (p[k], x[k]) pair into oprod, dispatching
+   on the host gauge precision.
+   @param[in,out] oprod Tensor field accumulated into
+   @param[in] p Full-parity p fields
+   @param[in] x Full-parity x fields; its size determines the number of pairs processed
+   @param[in] ferm_epsilon Per-pair coefficient vectors (one entry per parity)
+   @param[in] gauge_param Gauge parameters; cpu_prec selects the double or float instantiation
+*/
+void computeCloverSigmaOprod_reference(void *oprod, std::vector<quda::ColorSpinorField> &p,
+                                       std::vector<quda::ColorSpinorField> &x,
+                                       std::vector<std::vector<double>> &ferm_epsilon, QudaGaugeParam &gauge_param)
+{
+  for (size_t k = 0; k < x.size(); ++k) {
+    switch (gauge_param.cpu_prec) {
+    case QUDA_DOUBLE_PRECISION:
+      CloverSigmaOprod_reference<double, double>(oprod, p[k], x[k], ferm_epsilon[k]);
+      break;
+    case QUDA_SINGLE_PRECISION:
+      CloverSigmaOprod_reference<float, float>(oprod, p[k], x[k], ferm_epsilon[k]);
+      break;
+    default: errorQuda("Unsupported precision %d", gauge_param.cpu_prec);
+    }
+  }
+}
+/**
+   @brief Copy a double-precision spinor field of V sites (24 reals per site) from in to out, keeping
+   spin components 0 and 1 and negating spin components 2 and 3.  Works element by element, so out may
+   be the same buffer as in.
+   @param[out] out Destination field
+   @param[in] in Source field
+   @param[in] V Number of sites
+*/
+void Gamma5_host(double *out, double *in, const int V)
+{
+#pragma omp parallel for
+  for (int site = 0; site < V; site++) {
+    const double *src = in + site * 24;
+    double *dst = out + site * 24;
+    // reals 0..11 are spins 0 and 1, reals 12..23 are spins 2 and 3
+    for (int k = 0; k < 12; k++) dst[k] = src[k];
+    for (int k = 12; k < 24; k++) dst[k] = -src[k];
+  }
+}
+/**
+   @brief z = a * x + b * y on double-precision spinor fields of V sites (24 reals per site), computed
+   element by element.
+   @param[in] a Scale applied to x
+   @param[in] x First input field
+   @param[in] b Scale applied to y
+   @param[in] y Second input field
+   @param[out] z Output field
+   @param[in] V Number of sites
+*/
+void axpbyz_host(double a, double *x, double b, double *y, double *z, const int V)
+{
+  const int n_real = V * 24;
+#pragma omp parallel for
+  for (int k = 0; k < n_real; k++) z[k] = a * x[k] + b * y[k];
+}
+
+/**
+   @brief Copy a double-precision spinor field of V sites (24 reals per site) from in to out with the
+   upper two spin components (0, 1) and the lower two (2, 3) exchanged.
+   @param[out] out Destination field
+   @param[in] in Source field
+   @param[in] V Number of sites
+*/
+void Gamma5_host_UKQCD(double *out, double *in, const int V)
+{
+#pragma omp parallel for
+  for (int site = 0; site < V; site++) {
+    const double *src = in + site * 24;
+    double *dst = out + site * 24;
+    // reals 0..11 are spins 0 and 1, reals 12..23 are spins 2 and 3
+    for (int k = 0; k < 12; k++) {
+      dst[k] = src[12 + k];
+      dst[12 + k] = src[k];
+    }
+  }
+}
+/**
+   @brief a[k] += coeff * b[k] for the first len entries.
+   @tparam Float Element type of both arrays
+   @param[in,out] a Array accumulated into
+   @param[in] b Array added after scaling
+   @param[in] len Number of entries processed
+   @param[in] coeff Scale applied to b
+*/
+template <typename Float> void add_mom(Float *a, Float *b, int len, double coeff)
+{
+#pragma omp parallel for
+  for (int k = 0; k < len; k++) {
+    a[k] += coeff * b[k];
+  }
+}
+
+/**
+   @brief Zero a tensor field holding V * 6 * gauge_site_size entries of type Float.
+   @tparam Float Element type the buffer is interpreted as
+   @param[out] oprod_ Buffer to clear
+*/
+template <typename Float> void set_to_zero(void *oprod_)
+{
+  Float *field = (Float *)oprod_;
+  const size_t n_entries = V * 6 * gauge_site_size;
+#pragma omp parallel for
+  for (size_t k = 0; k < n_entries; k++) field[k] = 0;
+}
+
+/**
+   @brief Host reference for the (twisted-mass) clover force.  Builds the full-parity x and p fields from
+   the odd-parity solution vectors, computes the hopping-term force (CloverForce_reference), the clover
+   sigma-trace term (skipped when detratio is set), the clover sigma outer product and the clover
+   derivative into a temporary momentum field, and finally subtracts that field from h_mom.
+
+   Only matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC with dagger == QUDA_DAG_YES is accepted.  The spinor
+   helpers used here (Gamma5_host, axpbyz_host) and the final add_mom operate on double-precision data.
+   On return inv_param->solution_type has been set to QUDA_MATPCDAG_MATPC_SOLUTION.
+
+   @param[in,out] h_mom Host momentum field; the computed force is subtracted from it
+   @param[in] h_x Odd-parity solution vectors, one pointer per vector
+   @param[in] h_x0 Odd-parity auxiliary vectors, only read when detratio is non-zero
+   @param[in] coeff One coefficient per vector
+   @param[in] nvector Number of vectors
+   @param[in] gauge Per-direction host gauge field pointers
+   @param[in] clover Host clover field
+   @param[in] clover_inv Host inverse clover field
+   @param[in] gauge_param Gauge parameters
+   @param[in,out] inv_param Inverter parameters (solution_type is modified, see above)
+   @param[in] detratio Non-zero to compute the determinant-ratio variant
+*/
+void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff, int nvector,
+                             std::array<void *, 4> &gauge, std::vector<char> &clover, std::vector<char> &clover_inv,
+                             QudaGaugeParam *gauge_param, QudaInvertParam *inv_param, int detratio)
+{
+  if (inv_param->matpc_type != QUDA_MATPC_ODD_ODD_ASYMMETRIC)
+    errorQuda("Preconditioned operator type %d not supported by test code", inv_param->matpc_type);
+  if (inv_param->dagger != QUDA_DAG_YES) errorQuda("Test code presently requires dagger option");
+
+  quda::ColorSpinorParam qParam;
+  inv_param->solution_type = QUDA_MATDAG_MAT_SOLUTION; // set to full solution for field creation
+  constructWilsonTestSpinorParam(&qParam, inv_param, gauge_param);
+  inv_param->solution_type = QUDA_MATPCDAG_MATPC_SOLUTION; // restore to single parity
+
+  std::vector<quda::ColorSpinorField> x(nvector), p(nvector), x0(nvector);
+  for (int i = 0; i < nvector; i++) {
+    x[i] = quda::ColorSpinorField(qParam);
+    p[i] = quda::ColorSpinorField(qParam);
+    if (detratio) x0[i] = quda::ColorSpinorField(qParam);
+  }
+
+  // single-parity scratch field
+  qParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
+  qParam.x[0] /= 2;
+  quda::ColorSpinorField tmp(qParam);
+
+  // from here on qParam is used to wrap the caller's host buffers without copying
+  qParam.create = QUDA_REFERENCE_FIELD_CREATE;
+  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+
+  for (int i = 0; i < nvector; i++) {
+    qParam.v = h_x[i];
+    quda::ColorSpinorField load_half(qParam);
+    x[i].Odd() = load_half;
+
+    Gamma5_host(tmp.data<double *>(), x[i].Odd().data<double *>(), x[i].Odd().VolumeCB());
+
+    int parity = 0;
+    QudaMatPCType myMatPCType = inv_param->matpc_type;
+
+    if (myMatPCType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || myMatPCType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
+
+      // even part of x from the odd part
+      if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+        tmc_dslash(x[i].Even().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa,
+                   inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec,
+                   *gauge_param);
+      } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+        clover_dslash(x[i].Even().data(), gauge.data(), clover_inv.data(), tmp.data(), parity, QUDA_DAG_YES,
+                      inv_param->cpu_prec, *gauge_param);
+      } else {
+        errorQuda("TMCloverForce_reference: dslash_type not supported\n");
+      }
+      Gamma5_host(x[i].Even().data<double *>(), x[i].Even().data<double *>(), x[i].Even().VolumeCB());
+
+      // odd part of p: preconditioned operator applied to tmp
+      if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+        tmc_matpc(p[i].Odd().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa,
+                  inv_param->mu, inv_param->twist_flavor, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
+      } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+        clover_matpc(p[i].Odd().data(), gauge.data(), clover.data(), clover_inv.data(), tmp.data(), inv_param->kappa,
+                     myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
+      } else {
+        errorQuda("TMCloverForce_reference: dslash_type not supported\n");
+      }
+
+      // determinant ratio: add the auxiliary vector x0 to the odd part of p
+      if (detratio) {
+        qParam.v = h_x0[i];
+        quda::ColorSpinorField load_half(qParam);
+        x0[i].Odd() = load_half;
+        axpbyz_host(1, p[i].Odd().data<double *>(), 1, x0[i].Odd().data<double *>(), p[i].Odd().data<double *>(),
+                    p[i].Odd().VolumeCB());
+      }
+
+      // even part of p from the odd part
+      if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
+        tmc_dslash(p[i].Even().data(), gauge.data(), p[i].Odd().data(), clover.data(), clover_inv.data(),
+                   inv_param->kappa, inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_NO,
+                   inv_param->cpu_prec, *gauge_param);
+      } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
+        clover_dslash(p[i].Even().data(), gauge.data(), clover_inv.data(), p[i].Odd().data(), parity, QUDA_DAG_NO,
+                      inv_param->cpu_prec, *gauge_param);
+      } else {
+        errorQuda("TMCloverForce_reference: dslash_type not supported\n");
+      }
+
+    } else {
+      errorQuda("TMCloverForce_reference: MATPC type not supported\n");
+    }
+
+    Gamma5_host(p[i].Even().data<double *>(), p[i].Even().data<double *>(), p[i].Even().VolumeCB());
+    Gamma5_host(p[i].Odd().data<double *>(), p[i].Odd().data<double *>(), p[i].Odd().VolumeCB());
+  }
+  std::vector<double> force_coeff(nvector);
+  for (int i = 0; i < nvector; i++) { force_coeff[i] = 1.0 * coeff[i]; }
+
+  // temporary momentum field that the reference force is accumulated into
+  quda::GaugeFieldParam momparam(*gauge_param);
+  // momparam.order = QUDA_QDP_GAUGE_ORDER;
+  momparam.location = QUDA_CPU_FIELD_LOCATION;
+  momparam.order = QUDA_MILC_GAUGE_ORDER;
+  momparam.reconstruct = QUDA_RECONSTRUCT_10;
+  momparam.link_type = QUDA_ASQTAD_MOM_LINKS;
+  momparam.create = QUDA_ZERO_FIELD_CREATE;
+  quda::GaugeField mom(momparam);
+  createMomCPU(mom.data(), gauge_param->cpu_prec, 0.0);
+  void *refmom = mom.data();
+
+  // derivative of the Wilson operator; it corresponds to deriv_Sb(OE,...) plus deriv_Sb(EO,...) in tmLQCD
+  CloverForce_reference(refmom, gauge, x, p, force_coeff);
+
+  // create oprod and trace field
+  std::vector<char> oprod_(V * 6 * gauge_site_size * host_gauge_data_type_size);
+  void *oprod = oprod_.data();
+
+  if (gauge_param->cpu_prec == QUDA_DOUBLE_PRECISION)
+    set_to_zero<double>(oprod);
+  else if (gauge_param->cpu_prec == QUDA_SINGLE_PRECISION)
+    set_to_zero<float>(oprod);
+  else
+    errorQuda("precision not valid");
+
+  double k_csw_ov_8 = inv_param->kappa * inv_param->clover_csw / 8.0;
+  size_t twist_flavor = inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH ? inv_param->twist_flavor : QUDA_TWIST_NO;
+  double mu2
+    = twist_flavor != QUDA_TWIST_NO ? 4. * inv_param->kappa * inv_param->kappa * inv_param->mu * inv_param->mu : 0.0;
+  double eps2 = twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ?
+    4.0 * inv_param->kappa * inv_param->kappa * inv_param->epsilon * inv_param->epsilon :
+    0.0;
+
+  // derivative of the determinant of the sw term, second term of (A12) in hep-lat/0112051,  sw_deriv(EE, mnl->mu) in tmLQCD
+  // NOTE(review): twist_flavor is converted to the bool `twist` argument here, i.e. any non-zero value
+  // enables the extra factor -- confirm that QUDA_TWIST_NO is the only value expected to map to false
+  if (!detratio) computeCloverSigmaTrace_reference(oprod, clover.data(), k_csw_ov_8 * 32.0, 0, mu2, eps2, twist_flavor);
+
+  // One (even, odd) coefficient pair per vector.  The inner vectors are constructed with two elements:
+  // reserve() alone would leave size() == 0 and make the indexed writes below undefined behaviour.
+  std::vector<std::vector<double>> ferm_epsilon(nvector, std::vector<double>(2));
+  for (int i = 0; i < nvector; i++) {
+    ferm_epsilon[i][0] = k_csw_ov_8 * coeff[i];
+    ferm_epsilon[i][1] = k_csw_ov_8 * coeff[i] / (inv_param->kappa * inv_param->kappa);
+  }
+  // derivative of pseudofermion sw term, first term of (A12) in hep-lat/0112051,  sw_spinor_eo(EE,..) plus
+  // sw_spinor_eo(OO,..)  in tmLQCD
+  computeCloverSigmaOprod_reference(oprod, p, x, ferm_epsilon, *gauge_param);
+
+  // oprod = (A12) of hep-lat/0112051
+  // compute the insertion of oprod in Fig.27 of hep-lat/0112051
+  cloverDerivative_reference(refmom, gauge.data(), oprod, QUDA_ODD_PARITY, *gauge_param);
+  cloverDerivative_reference(refmom, gauge.data(), oprod, QUDA_EVEN_PARITY, *gauge_param);
+
+  // h_mom -= reference force
+  add_mom((double *)h_mom, (double *)mom.data(), 4 * V * mom_site_size, -1.0);
+}
diff --git a/tests/host_reference/clover_force_reference.h b/tests/host_reference/clover_force_reference.h
new file mode 100644
index 0000000000..e698ffe99e
--- /dev/null
+++ b/tests/host_reference/clover_force_reference.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <array>
+#include <vector>
+#include "quda.h"
+
+void TMCloverForce_reference(void *h_mom, void **h_x, void **h_x0, double *coeff, int nvector,
+                             std::array<void *, 4> &gauge, std::vector<char> &clover, std::vector<char> &clover_inv,
+                             QudaGaugeParam *gauge_param, QudaInvertParam *inv_param, int detratio);
diff --git a/tests/host_reference/clover_reference.cpp b/tests/host_reference/clover_reference.cpp
index fc09346960..1a89ead931 100644
--- a/tests/host_reference/clover_reference.cpp
+++ b/tests/host_reference/clover_reference.cpp
@@ -21,6 +21,7 @@ template <typename sFloat, typename cFloat> void cloverReference(sFloat *out, cF
   int N = nColor * nSpin / 2;
   int chiralBlock = N + 2 * (N - 1) * N / 2;
 
+#pragma omp parallel for
   for (int i = 0; i < Vh; i++) {
     std::complex<sFloat> *In = reinterpret_cast<std::complex<sFloat> *>(&in[i * nSpin * nColor * 2]);
     std::complex<sFloat> *Out = reinterpret_cast<std::complex<sFloat> *>(&out[i * nSpin * nColor * 2]);
@@ -86,7 +87,6 @@ void clover_dslash(void *out, void **gauge, void *clover, void *in, int parity,
 void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void *in, double kappa,
                   QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
 {
-
   double kappa2 = -kappa * kappa;
   void *tmp = safe_malloc(Vh * spinor_site_size * precision);
 
@@ -143,7 +143,6 @@ void clover_matpc(void *out, void **gauge, void *clover, void *clover_inv, void
 void clover_mat(void *out, void **gauge, void *clover, void *in, double kappa, int dagger, QudaPrecision precision,
                 QudaGaugeParam &gauge_param)
 {
-
   void *tmp = safe_malloc(V * spinor_site_size * precision);
 
   void *inEven = in;
@@ -171,6 +170,7 @@ void applyTwist(void *out, void *in, void *tmpH, double a, QudaPrecision precisi
 {
   switch (precision) {
   case QUDA_DOUBLE_PRECISION:
+#pragma omp parallel for
     for (int i = 0; i < Vh; i++)
       for (int s = 0; s < 4; s++) {
         double a5 = ((s / 2) ? -1.0 : +1.0) * a;
@@ -183,6 +183,7 @@ void applyTwist(void *out, void *in, void *tmpH, double a, QudaPrecision precisi
       }
     break;
   case QUDA_SINGLE_PRECISION:
+#pragma omp parallel for
     for (int i = 0; i < Vh; i++)
       for (int s = 0; s < 4; s++) {
         float a5 = ((s / 2) ? -1.0 : +1.0) * a;
@@ -312,18 +313,18 @@ void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, dou
   void *tmp1 = safe_malloc(Vh * spinor_site_size * precision);
   void *tmp2 = safe_malloc(Vh * spinor_site_size * precision);
 
-  if (dagger) {
-    twistCloverGamma5(tmp1, in, clover, cInv, dagger, kappa, mu, flavor, 1 - parity, QUDA_TWIST_GAMMA5_INVERSE,
-                      precision);
-    if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
-      wil_dslash(tmp2, gauge, tmp1, parity, dagger, precision, param);
-      twistCloverGamma5(out, tmp2, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision);
-    } else {
-      wil_dslash(out, gauge, tmp1, parity, dagger, precision, param);
-    }
-  } else {
+  if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
     wil_dslash(tmp1, gauge, in, parity, dagger, precision, param);
     twistCloverGamma5(out, tmp1, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision);
+  } else {
+    if (dagger) {
+      twistCloverGamma5(tmp1, in, clover, cInv, dagger, kappa, mu, flavor, 1 - parity, QUDA_TWIST_GAMMA5_INVERSE,
+                        precision);
+      wil_dslash(out, gauge, tmp1, parity, dagger, precision, param);
+    } else {
+      wil_dslash(tmp1, gauge, in, parity, dagger, precision, param);
+      twistCloverGamma5(out, tmp1, clover, cInv, dagger, kappa, mu, flavor, parity, QUDA_TWIST_GAMMA5_INVERSE, precision);
+    }
   }
 
   host_free(tmp2);
@@ -334,7 +335,6 @@ void tmc_dslash(void *out, void **gauge, void *in, void *clover, void *cInv, dou
 void tmc_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, QudaTwistFlavorType flavor,
              int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param)
 {
-
   void *tmp = safe_malloc(V * spinor_site_size * precision);
 
   void *inEven = in;
@@ -363,7 +363,6 @@ void tmc_matpc(void *out, void **gauge, void *in, void *clover, void *cInv, doub
                QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int dagger, QudaPrecision precision,
                QudaGaugeParam &gauge_param)
 {
-
   double kappa2 = -kappa * kappa;
 
   void *tmp1 = safe_malloc(Vh * spinor_site_size * precision);
@@ -627,7 +626,6 @@ void tmc_ndeg_matpc(void *out, void **gauge, void *in, void *clover, void *cInv,
 void cloverHasenbuchTwist_mat(void *out, void **gauge, void *clover, void *in, double kappa, double mu, int dagger,
                               QudaPrecision precision, QudaGaugeParam &gauge_param, QudaMatPCType matpc_type)
 {
-
   // out = CloverMat in
   clover_mat(out, gauge, clover, in, kappa, dagger, precision, gauge_param);
 
@@ -681,7 +679,6 @@ void cloverHasenbuschTwist_matpc(void *out, void **gauge, void *in, void *clover
                                  QudaMatPCType matpc_type, int dagger, QudaPrecision precision,
                                  QudaGaugeParam &gauge_param)
 {
-
   clover_matpc(out, gauge, clover, cInv, in, kappa, matpc_type, dagger, precision, gauge_param);
 
   if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD) {
diff --git a/tests/host_reference/gamma_reference.h b/tests/host_reference/gamma_reference.h
new file mode 100644
index 0000000000..ab747a14bf
--- /dev/null
+++ b/tests/host_reference/gamma_reference.h
@@ -0,0 +1,82 @@
+#pragma once
+
+// FIXME: this was copied from wilson_dslash_reference.cpp; it may be better to create a separate file with the projectors
+// clang-format off
+static const double projector[8][4][4][2] = { // projector[2 * mu] / projector[2 * mu + 1] equal identity +/- local_gamma[mu], entry by entry
+  { // identity + local_gamma[0]
+    {{1,0}, {0,0}, {0,0}, {0,-1}},
+    {{0,0}, {1,0}, {0,-1}, {0,0}},
+    {{0,0}, {0,1}, {1,0}, {0,0}},
+    {{0,1}, {0,0}, {0,0}, {1,0}}
+  },
+  { // identity - local_gamma[0]
+    {{1,0}, {0,0}, {0,0}, {0,1}},
+    {{0,0}, {1,0}, {0,1}, {0,0}},
+    {{0,0}, {0,-1}, {1,0}, {0,0}},
+    {{0,-1}, {0,0}, {0,0}, {1,0}}
+  },
+  { // identity + local_gamma[1]
+    {{1,0}, {0,0}, {0,0}, {1,0}},
+    {{0,0}, {1,0}, {-1,0}, {0,0}},
+    {{0,0}, {-1,0}, {1,0}, {0,0}},
+    {{1,0}, {0,0}, {0,0}, {1,0}}
+  },
+  { // identity - local_gamma[1]
+    {{1,0}, {0,0}, {0,0}, {-1,0}},
+    {{0,0}, {1,0}, {1,0}, {0,0}},
+    {{0,0}, {1,0}, {1,0}, {0,0}},
+    {{-1,0}, {0,0}, {0,0}, {1,0}}
+  },
+  { // identity + local_gamma[2]
+    {{1,0}, {0,0}, {0,-1}, {0,0}},
+    {{0,0}, {1,0}, {0,0}, {0,1}},
+    {{0,1}, {0,0}, {1,0}, {0,0}},
+    {{0,0}, {0,-1}, {0,0}, {1,0}}
+  },
+  { // identity - local_gamma[2]
+    {{1,0}, {0,0}, {0,1}, {0,0}},
+    {{0,0}, {1,0}, {0,0}, {0,-1}},
+    {{0,-1}, {0,0}, {1,0}, {0,0}},
+    {{0,0}, {0,1}, {0,0}, {1,0}}
+  },
+  { // identity + local_gamma[3]
+    {{1,0}, {0,0}, {-1,0}, {0,0}},
+    {{0,0}, {1,0}, {0,0}, {-1,0}},
+    {{-1,0}, {0,0}, {1,0}, {0,0}},
+    {{0,0}, {-1,0}, {0,0}, {1,0}}
+  },
+  { // identity - local_gamma[3]
+    {{1,0}, {0,0}, {1,0}, {0,0}},
+    {{0,0}, {1,0}, {0,0}, {1,0}},
+    {{1,0}, {0,0}, {1,0}, {0,0}},
+    {{0,0}, {1,0}, {0,0}, {1,0}}
+  }
+};
+
+static const double local_gamma[4][4][4][2] = { // four 4x4 tables of pairs, labelled X, Y, Z, T below; NOTE(review): each pair is presumably {real, imaginary} -- confirm against the consuming code
+  {// X
+    {{0, 0}, {0, 0}, {0, 0}, {0, -1}},
+    {{0, 0}, {0, 0}, {0, -1}, {0, 0}},
+    {{0, 0}, {0, 1}, {0, 0}, {0, 0}},
+    {{0, 1}, {0, 0}, {0, 0}, {0, 0}}
+  },
+  {// Y
+    {{0, 0}, {0, 0}, {0, 0}, {1, 0}},
+    {{0, 0}, {0, 0}, {-1, 0}, {0, 0}},
+    {{0, 0}, {-1, 0}, {0, 0}, {0, 0}},
+    {{1, 0}, {0, 0}, {0, 0}, {0, 0}}
+  },
+  {// Z
+    {{0, 0}, {0, 0}, {0, -1}, {0, 0}},
+    {{0, 0}, {0, 0}, {0, 0}, {0, 1}},
+    {{0, 1}, {0, 0}, {0, 0}, {0, 0}},
+    {{0, 0}, {0, -1}, {0, 0}, {0, 0}}
+  },
+  {// T
+    {{0, 0}, {0, 0}, {-1, 0}, {0, 0}},
+    {{0, 0}, {0, 0}, {0, 0}, {-1, 0}},
+    {{-1, 0}, {0, 0}, {0, 0}, {0, 0}},
+    {{0, 0}, {-1, 0}, {0, 0}, {0, 0}}
+  }
+};
+// clang-format on
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index 0f27fa9572..ce285a6b99 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -4,8 +4,6 @@
 #include <string.h>
 #include <type_traits>
 
-#include "quda.h"
-#include "gauge_field.h"
 #include "host_utils.h"
 #include "misc.h"
 #include "gauge_force_reference.h"
@@ -119,25 +117,7 @@ struct danti_hermitmat {
   double space;
 };
 
-// convenience struct for passing around lattice meta data
-struct lattice_t {
-  int n_color;
-  size_t volume;
-  size_t volume_ex;
-  int x[4];
-  int r[4];
-  int e[4];
 
-  lattice_t(const quda::GaugeField &lat) : n_color(lat.Ncolor()), volume(1), volume_ex(lat.Volume())
-  {
-    for (int d = 0; d < 4; d++) {
-      x[d] = lat.X()[d] - 2 * lat.R()[d];
-      r[d] = lat.R()[d];
-      e[d] = lat.X()[d];
-      volume *= x[d];
-    }
-  };
-};
 
 extern int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1);
 
diff --git a/tests/host_reference/gauge_force_reference.h b/tests/host_reference/gauge_force_reference.h
index 9b6d06a555..5331b99386 100644
--- a/tests/host_reference/gauge_force_reference.h
+++ b/tests/host_reference/gauge_force_reference.h
@@ -1,4 +1,28 @@
 #pragma once
+#include "quda.h"
+#include <comm_quda.h>
+#include "gauge_field.h"
+// convenience struct for passing around lattice meta data (color count, per-dimension extents and volumes)
+struct lattice_t {
+  int n_color;      // lat.Ncolor() of the source field
+  size_t volume;    // product over d of x[d]
+  size_t volume_ex; // lat.Volume() of the source field
+  int x[4];         // lat.X()[d] with 2 * lat.R()[d] removed
+  int r[4];         // lat.R()[d]
+  int e[4];         // lat.X()[d], unmodified
+
+  lattice_t(const quda::GaugeField &lat) : n_color(lat.Ncolor()), volume(1), volume_ex(lat.Volume())
+  {
+    for (int d = 0; d < 4; d++) {
+      x[d] = lat.X()[d] - 2 * lat.R()[d];
+      r[d] = lat.R()[d];
+      e[d] = lat.X()[d];
+      volume *= x[d]; // accumulate the volume spanned by the reduced extents
+    }
+  }
+};
+
+int gf_neighborIndexFullLattice(size_t i, int dx[], const lattice_t &lat);
 
 #include <gauge_field.h>
 
diff --git a/tests/host_reference/wilson_dslash_reference.cpp b/tests/host_reference/wilson_dslash_reference.cpp
index 471f79c38d..dd6f025001 100644
--- a/tests/host_reference/wilson_dslash_reference.cpp
+++ b/tests/host_reference/wilson_dslash_reference.cpp
@@ -12,62 +12,10 @@
 
 #include <dslash_reference.h>
 #include <string.h>
+#include "gamma_reference.h"
 
 using namespace quda;
 
-// clang-format off
-static const double projector[8][4][4][2] = {
-  {
-    {{1,0}, {0,0}, {0,0}, {0,-1}},
-    {{0,0}, {1,0}, {0,-1}, {0,0}},
-    {{0,0}, {0,1}, {1,0}, {0,0}},
-    {{0,1}, {0,0}, {0,0}, {1,0}}
-  },
-  {
-    {{1,0}, {0,0}, {0,0}, {0,1}},
-    {{0,0}, {1,0}, {0,1}, {0,0}},
-    {{0,0}, {0,-1}, {1,0}, {0,0}},
-    {{0,-1}, {0,0}, {0,0}, {1,0}}
-  },
-  {
-    {{1,0}, {0,0}, {0,0}, {1,0}},
-    {{0,0}, {1,0}, {-1,0}, {0,0}},
-    {{0,0}, {-1,0}, {1,0}, {0,0}},
-    {{1,0}, {0,0}, {0,0}, {1,0}}
-  },
-  {
-    {{1,0}, {0,0}, {0,0}, {-1,0}},
-    {{0,0}, {1,0}, {1,0}, {0,0}},
-    {{0,0}, {1,0}, {1,0}, {0,0}},
-    {{-1,0}, {0,0}, {0,0}, {1,0}}
-  },
-  {
-    {{1,0}, {0,0}, {0,-1}, {0,0}},
-    {{0,0}, {1,0}, {0,0}, {0,1}},
-    {{0,1}, {0,0}, {1,0}, {0,0}},
-    {{0,0}, {0,-1}, {0,0}, {1,0}}
-  },
-  {
-    {{1,0}, {0,0}, {0,1}, {0,0}},
-    {{0,0}, {1,0}, {0,0}, {0,-1}},
-    {{0,-1}, {0,0}, {1,0}, {0,0}},
-    {{0,0}, {0,1}, {0,0}, {1,0}}
-  },
-  {
-    {{1,0}, {0,0}, {-1,0}, {0,0}},
-    {{0,0}, {1,0}, {0,0}, {-1,0}},
-    {{-1,0}, {0,0}, {1,0}, {0,0}},
-    {{0,0}, {-1,0}, {0,0}, {1,0}}
-  },
-  {
-    {{1,0}, {0,0}, {1,0}, {0,0}},
-    {{0,0}, {1,0}, {0,0}, {1,0}},
-    {{1,0}, {0,0}, {1,0}, {0,0}},
-    {{0,0}, {1,0}, {0,0}, {1,0}}
-  }
-};
-// clang-format on
-
 // todo pass projector
 template <typename Float> void multiplySpinorByDiracProjector(Float *res, int projIdx, const Float *spinorIn)
 {
@@ -111,6 +59,7 @@ void dslashReference(sFloat *res, gFloat **gaugeFull, sFloat *spinorField, int o
     gaugeOdd[dir] = gaugeFull[dir] + Vh * gauge_site_size;
   }
 
+#pragma omp parallel for
   for (int i = 0; i < Vh; i++) {
     for (int dir = 0; dir < 8; dir++) {
       gFloat *gauge = gaugeLink(i, dir, oddBit, gaugeEven, gaugeOdd, 1);
@@ -150,6 +99,7 @@ void dslashReference(sFloat *res, gFloat **gaugeFull, gFloat **ghostGauge, sFloa
     ghostGaugeOdd[dir] = ghostGauge[dir] + (faceVolume[dir] / 2) * gauge_site_size;
   }
 
+#pragma omp parallel for
   for (int i = 0; i < Vh; i++) {
 
     for (int dir = 0; dir < 8; dir++) {
@@ -247,7 +197,6 @@ template <typename sFloat>
 void twistGamma5(sFloat *out, sFloat *in, const int dagger, const sFloat kappa, const sFloat mu,
                  const QudaTwistFlavorType flavor, const int V, QudaTwistGamma5Type twist)
 {
-
   sFloat a = 0.0, b = 0.0;
   if (twist == QUDA_TWIST_GAMMA5_DIRECT) { // applying the twist
     a = 2.0 * kappa * mu * flavor;         // mu already includes the flavor
@@ -262,6 +211,7 @@ void twistGamma5(sFloat *out, sFloat *in, const int dagger, const sFloat kappa,
 
   if (dagger) a *= -1.0;
 
+#pragma omp parallel for
   for (int i = 0; i < V; i++) {
     sFloat tmp[24];
     for (int s = 0; s < 4; s++)
@@ -289,7 +239,6 @@ void twist_gamma5(void *out, void *in, int daggerBit, double kappa, double mu, Q
 void tm_dslash(void *res, void **gaugeFull, void *spinorField, double kappa, double mu, QudaTwistFlavorType flavor,
                int oddBit, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
 {
-
   if (daggerBit && (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD))
     twist_gamma5(spinorField, spinorField, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision);
 
@@ -306,7 +255,6 @@ void tm_dslash(void *res, void **gaugeFull, void *spinorField, double kappa, dou
 void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
              QudaGaugeParam &gauge_param)
 {
-
   void *inEven = in;
   void *inOdd = (char *)in + Vh * spinor_site_size * precision;
   void *outEven = out;
@@ -322,7 +270,6 @@ void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, Qu
 void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger_bit,
             QudaPrecision precision, QudaGaugeParam &gauge_param)
 {
-
   void *inEven = in;
   void *inOdd = (char *)in + Vh * spinor_site_size * precision;
   void *outEven = out;
@@ -345,7 +292,6 @@ void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwis
 void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int daggerBit,
                QudaPrecision precision, QudaGaugeParam &gauge_param)
 {
-
   void *tmp = safe_malloc(Vh * spinor_site_size * precision);
 
   // FIXME: remove once reference clover is finished
@@ -369,7 +315,6 @@ void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatP
 void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu, QudaTwistFlavorType flavor,
               QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param)
 {
-
   void *tmp = safe_malloc(Vh * spinor_site_size * precision);
 
   if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
@@ -425,7 +370,6 @@ template <typename sFloat>
 void ndegTwistGamma5(sFloat *out1, sFloat *out2, sFloat *in1, sFloat *in2, const int dagger, const sFloat kappa,
                      const sFloat mu, const sFloat epsilon, const int V, QudaTwistGamma5Type twist)
 {
-
   sFloat a = 0.0, b = 0.0, d = 0.0;
   if (twist == QUDA_TWIST_GAMMA5_DIRECT) { // applying the twist
     a = 2.0 * kappa * mu;
@@ -442,6 +386,7 @@ void ndegTwistGamma5(sFloat *out1, sFloat *out2, sFloat *in1, sFloat *in2, const
 
   if (dagger) a *= -1.0;
 
+#pragma omp parallel for
   for (int i = 0; i < V; i++) {
     sFloat tmp1[24];
     sFloat tmp2[24];
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index d842349f1d..ea7c779f05 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -302,6 +302,8 @@ bool   smear_delete_two_link  = true;
 
 bool enable_testing = false;
 
+bool detratio = false; // bound to the --determinant-ratio option registered in add_clover_force_option_group
+
 namespace
 {
   CLI::TransformPairs<QudaCABasis> ca_basis_map {{"power", QUDA_POWER_BASIS}, {"chebyshev", QUDA_CHEBYSHEV_BASIS}};
@@ -1116,3 +1118,9 @@ void add_quark_smear_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup->add_option("--smear-nsteps", smear_n_steps, "Number of smearing steps (default 50)");
   opgroup->add_option("--smear-t0", smear_t0, "Index of the time slice (default -1)");
 }
+
+void add_clover_force_option_group(std::shared_ptr<QUDAApp> quda_app)
+{ // registers the "Clover force" option group; --determinant-ratio is stored in the global detratio
+  auto group = quda_app->add_option_group("Clover force", "Options controlling clover force testing");
+  group->add_option("--determinant-ratio", detratio, "Test a ratio of determinants. Default is false");
+}
diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h
index a5105ff5b6..fb52e5764d 100644
--- a/tests/utils/command_line_params.h
+++ b/tests/utils/command_line_params.h
@@ -140,6 +140,7 @@ void add_gaugefix_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_comms_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_testing_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_quark_smear_option_group(std::shared_ptr<QUDAApp> quda_app);
+void add_clover_force_option_group(std::shared_ptr<QUDAApp> quda_app);
 
 template <typename T> std::string inline get_string(CLI::TransformPairs<T> &map, T val)
 {
@@ -428,3 +429,5 @@ extern bool   smear_delete_two_link;
 extern std::array<int, 4> grid_partition;
 
 extern bool enable_testing;
+
+extern bool detratio;
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 70aea9cdc2..ecf748976f 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -179,7 +179,7 @@ void constructHostGaugeField(void **gauge, QudaGaugeParam &gauge_param, int argc
   int construct_type = 0;
   if (latfile.size() > 0) {
     // load in the command line supplied gauge field using QIO and LIME
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Loading the gauge field in %s\n", latfile.c_str());
+    logQuda(QUDA_VERBOSE, "Loading the gauge field in %s\n", latfile.c_str());
     read_gauge_field(latfile.c_str(), gauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
     construct_type = 2;
   } else {
@@ -191,6 +191,20 @@ void constructHostGaugeField(void **gauge, QudaGaugeParam &gauge_param, int argc
   constructQudaGaugeField(gauge, construct_type, gauge_param.cpu_prec, &gauge_param);
 }
 
+void constructHostGaugeField(quda::GaugeField &gauge, QudaGaugeParam &gauge_param, int argc, char **argv)
+{
+  if (gauge.Order() != QUDA_QDP_GAUGE_ORDER) { // fill a scratch QDP-ordered field, then assign it to gauge
+    GaugeFieldParam qdp_param(gauge);
+    qdp_param.order = QUDA_QDP_GAUGE_ORDER;
+    qdp_param.create = QUDA_NULL_FIELD_CREATE;
+    GaugeField qdp_gauge(qdp_param);
+    constructHostGaugeField(static_cast<void **>(qdp_gauge.raw_pointer()), gauge_param, argc, argv);
+    gauge = qdp_gauge;
+  } else { // gauge is already QDP ordered: pass its own raw pointer to the void ** overload
+    constructHostGaugeField(static_cast<void **>(gauge.raw_pointer()), gauge_param, argc, argv);
+  }
+}
+
 void constructHostCloverField(void *clover, void *, QudaInvertParam &inv_param)
 {
   double norm = 0.01; // clover components are random numbers in the range (-norm, norm)
@@ -1454,8 +1468,10 @@ void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase)
 
 void createSiteLinkCPU(quda::GaugeField &u, QudaPrecision precision, int phase)
 {
-  void *link[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
-  createSiteLinkCPU(link, precision, phase);
+  if (u.Order() != QUDA_QDP_GAUGE_ORDER) // any order other than QDP is reported as an error
+    errorQuda("Unsupported gauge order %d", u.Order());
+  else
+    createSiteLinkCPU(static_cast<void **>(u.raw_pointer()), precision, phase);
 }
 
 template <typename Float> int compareLink(Float **linkA, Float **linkB, int len)
@@ -1515,16 +1531,16 @@ static int compare_link(void **linkA, void **linkB, int len, QudaPrecision preci
   return ret;
 }
 
-static int compare_link(const GaugeField &linkA, const GaugeField &linkB)
+static int compare_link(const GaugeField &a, const GaugeField &b)
 {
+  if (a.Order() != QUDA_QDP_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", a.Order());
   int ret;
-
-  void *a[] = {linkA.data(0), linkA.data(1), linkA.data(2), linkA.data(3)};
-  void *b[] = {linkB.data(0), linkB.data(1), linkB.data(2), linkB.data(3)};
-  if (checkPrecision(linkA, linkB) == QUDA_DOUBLE_PRECISION) {
-    ret = compareLink((double **)a, (double **)b, linkA.Volume());
+  if (checkPrecision(a, b) == QUDA_DOUBLE_PRECISION) {
+    ret = compareLink(reinterpret_cast<double **>(a.raw_pointer()), reinterpret_cast<double **>(b.raw_pointer()),
+                      a.Volume());
   } else {
-    ret = compareLink((float **)a, (float **)b, linkA.Volume());
+    ret = compareLink(reinterpret_cast<float **>(a.raw_pointer()), reinterpret_cast<float **>(b.raw_pointer()),
+                      a.Volume());
   }
 
   return ret;
@@ -1566,6 +1582,7 @@ int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *
 
 int strong_check_link(const GaugeField &linkA, const std::string &msgA, const GaugeField &linkB, const std::string &msgB)
 {
+  if (linkA.Order() != QUDA_QDP_GAUGE_ORDER) errorQuda("Unsupported gauge order %d", linkA.Order());
   if (verbosity >= QUDA_VERBOSE) {
     printfQuda("%s\n", msgA.c_str());
     printLinkElement(linkA.data(0), 0, prec);
@@ -1587,7 +1604,7 @@ int strong_check_link(const GaugeField &linkA, const std::string &msgA, const Ga
   return compare_link(linkA, linkB);
 }
 
-void createMomCPU(void *mom, QudaPrecision precision)
+void createMomCPU(void *mom, QudaPrecision precision, double max_val)
 {
   size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
   void *temp = safe_malloc(4 * V * gauge_site_size * gSize);
@@ -1597,7 +1614,7 @@ void createMomCPU(void *mom, QudaPrecision precision)
       for (int dir = 0; dir < 4; dir++) {
         double *thismom = (double *)mom;
         for (auto k = 0lu; k < mom_site_size; k++) {
-          thismom[(4 * i + dir) * mom_site_size + k] = 1.0 * rand() / RAND_MAX;
+          thismom[(4 * i + dir) * mom_site_size + k] = max_val * rand() / RAND_MAX;
           if (k == mom_site_size - 1) thismom[(4 * i + dir) * mom_site_size + k] = 0.0;
         }
       }
@@ -1605,7 +1622,7 @@ void createMomCPU(void *mom, QudaPrecision precision)
       for (int dir = 0; dir < 4; dir++) {
         float *thismom = (float *)mom;
         for (auto k = 0lu; k < mom_site_size; k++) {
-          thismom[(4 * i + dir) * mom_site_size + k] = 1.0 * rand() / RAND_MAX;
+          thismom[(4 * i + dir) * mom_site_size + k] = max_val * rand() / RAND_MAX;
           if (k == mom_site_size - 1) thismom[(4 * i + dir) * mom_site_size + k] = 0.0;
         }
       }
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index b788cc3081..9465da0d41 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -92,6 +92,7 @@ void setQudaDefaultMgTestParams();
 //------------------------------------------------------
 void constructQudaGaugeField(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param);
 void constructHostGaugeField(void **gauge, QudaGaugeParam &gauge_param, int argc, char **argv);
+void constructHostGaugeField(quda::GaugeField &gauge, QudaGaugeParam &gauge_param, int argc, char **argv);
 void constructHostCloverField(void *clover, void *clover_inv, QudaInvertParam &inv_param);
 void constructQudaCloverField(void *clover, double norm, double diag, QudaPrecision precision);
 template <typename Float> void constructCloverField(Float *res, double norm, double diag);
@@ -195,7 +196,7 @@ int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec);
  */
 double mom_action(void *mom, QudaPrecision prec, int len);
 
-void createMomCPU(void *mom, QudaPrecision precision);
+void createMomCPU(void *mom, QudaPrecision precision, double max_val = 1.0);
 
 /**
    @brief Create four Staggered spinor fields, whose outer product is used for momentum calculations