diff --git a/include/blas_3d.h b/include/blas_3d.h
new file mode 100644
index 0000000000..2616a8ac03
--- /dev/null
+++ b/include/blas_3d.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <vector>
+#include <color_spinor_field.h>
+
+namespace quda
+{
+
+  namespace blas3d
+  {
+
+    // Local enum for the 3D copy type
+    enum class copyType { COPY_TO_3D, COPY_FROM_3D, SWAP_3D };
+
+    /**
+       @brief Extract / insert / swap a timeslice between a 4-d field and a 3-d field
+       @param[in] slice Which slice
+       @param[in] type Extracting a timeslice (COPY_TO_3D), inserting
+       a timeslice (COPY_FROM_3D), or swapping timeslices (SWAP_3D)
+       @param[in,out] x 3-d field
+       @param[in,out] y 4-d field
+    */
+    void copy(int slice, copyType type, ColorSpinorField &x, ColorSpinorField &y);
+
+    /**
+       @brief Swap the given slice in two fields
+       @param[in] slice The slice we wish to swap in the fields
+       @param[in,out] x Field whose slice we wish to swap
+       @param[in,out] y Field whose slice we wish to swap
+    */
+    void swap(int slice, ColorSpinorField &x, ColorSpinorField &y);
+
+    /**
+       @brief Compute a set of real-valued inner products <x,y>, where each inner
+       product is restricted to a timeslice.
+       @param[out] result Vector of spatial inner products
+       @param[in] x Left vector field
+       @param[in] y Right vector field
+    */
+    void reDotProduct(std::vector<double> &result, const ColorSpinorField &x, const ColorSpinorField &y);
+
+    /**
+       @brief Compute a set of complex-valued inner products <x,y>, where each inner
+       product is restricted to a timeslice.
+       @param[out] result Vector of spatial inner products
+       @param[in] x Left vector field
+       @param[in] y Right vector field
+    */
+    void cDotProduct(std::vector<Complex> &result, const ColorSpinorField &x, const ColorSpinorField &y);
+
+    /**
+       @brief Timeslice real-valued scaling of the field
+       @param[in] a Vector of scale factors (length = local temporal extent)
+       @param[in,out] x Field we wish to scale
+    */
+    void ax(const std::vector<double> &a, ColorSpinorField &x);
+
+    /**
+       @brief Timeslice real-valued axpby computation
+       @param[in] a Vector of scale factors (length = local temporal extent)
+       @param[in] x Input field
+       @param[in] b Vector of scale factors (length = local temporal extent)
+       @param[in,out] y Field we are updating
+    */
+    void axpby(const std::vector<double> &a, const ColorSpinorField &x, const std::vector<double> &b,
+               ColorSpinorField &y);
+
+    /**
+       @brief Timeslice complex-valued axpby computation
+       @param[in] a Vector of scale factors (length = local temporal extent)
+       @param[in] x Input field
+       @param[in] b Vector of scale factors (length = local temporal extent)
+       @param[in,out] y Field we are updating
+    */
+    void caxpby(const std::vector<Complex> &a, const ColorSpinorField &x, const std::vector<Complex> &b,
+               ColorSpinorField &y);
+
+  } // namespace blas3d
+} // namespace quda
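For orientation, here is a minimal usage sketch of the new blas3d interface (illustrative only, not part of the patch): it normalises every timeslice of a field independently. The field `y` and the helper name `normalize_timeslices` are assumptions; everything else follows the declarations above.

    #include <cmath>
    #include <vector>
    #include <blas_3d.h>

    // Normalise each timeslice of y independently; y is assumed to be a
    // valid 4-d quda::ColorSpinorField (construction not shown).
    void normalize_timeslices(quda::ColorSpinorField &y)
    {
      int t_len = y.X(3); // local temporal extent
      std::vector<double> norms(t_len);
      quda::blas3d::reDotProduct(norms, y, y); // norms[t] = <y,y> restricted to slice t
      std::vector<double> a(t_len);
      for (int t = 0; t < t_len; t++) a[t] = 1.0 / std::sqrt(norms[t]);
      quda::blas3d::ax(a, y); // y[t] *= a[t], one scale factor per slice
    }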
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 059329507b..d8980afc75 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -280,6 +280,8 @@ namespace quda
       //! for deflation etc.
       if (is_composite) printfQuda("Number of elements = %d\n", composite_dim);
     }
+
+    void change_dim(int D, int d) { x[D] = d; }
   };
 
   struct DslashConstant;
@@ -1054,6 +1056,38 @@ namespace quda
   */
   void spinorDistanceReweight(ColorSpinorField &src, double alpha0, int t0);
 
+  /**
+     @brief Helper function for determining if the spin of the fields is the same.
+     @param[in] a Input field
+     @param[in] b Input field
+     @return If the spin is unique, return the number of spins
+  */
+  inline int Spin_(const char *func, const char *file, int line, const ColorSpinorField &a, const ColorSpinorField &b)
+  {
+    int nSpin = 0;
+    if (a.Nspin() == b.Nspin())
+      nSpin = a.Nspin();
+    else
+      errorQuda("Spin %d %d do not match (%s:%d in %s())", a.Nspin(), b.Nspin(), file, line, func);
+    return nSpin;
+  }
+
+  /**
+     @brief Helper function for determining if the spin of the fields is the same.
+     @param[in] a Input field
+     @param[in] b Input field
+     @param[in] args List of additional fields to check spin on
+     @return If the spin is unique, return the number of spins
+  */
+  template <typename... Args>
+  inline int Spin_(const char *func, const char *file, int line, const ColorSpinorField &a, const ColorSpinorField &b,
+                   const Args &...args)
+  {
+    return Spin_(func, file, line, a, b) & Spin_(func, file, line, a, args...);
+  }
+
+#define checkSpin(...) Spin_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
   /**
      @brief Helper function for determining if the preconditioning
      type of the fields is the same.
diff --git a/include/comm_key.h b/include/comm_key.h
index 6618068bb0..a9682ddc03 100644
--- a/include/comm_key.h
+++ b/include/comm_key.h
@@ -1,27 +1,33 @@
 #pragma once
 
+#include <array.h>
+
 namespace quda
 {
 
   struct CommKey {
     static constexpr int n_dim = 4;
+    array<int, n_dim> key = {0, 0, 0, 0};
 
-    int array[n_dim] = {0, 0, 0, 0};
+    constexpr int product() { return key[0] * key[1] * key[2] * key[3]; }
 
-    constexpr inline int product() { return array[0] * array[1] * array[2] * array[3]; }
+    constexpr int &operator[](int d) { return key[d]; }
 
-    constexpr inline int &operator[](int d) { return array[d]; }
+    constexpr const int &operator[](int d) const { return key[d]; }
 
-    constexpr inline const int &operator[](int d) const { return array[d]; }
+    constexpr auto data() { return key.data; }
 
-    constexpr inline int *data() { return array; }
+    constexpr auto data() const { return key.data; }
 
-    constexpr inline const int *data() const { return array; }
+    constexpr bool is_valid() const { return (key[0] > 0) && (key[1] > 0) && (key[2] > 0) && (key[3] > 0); }
 
-    constexpr inline bool is_valid() const
+    bool operator==(const CommKey &other) const
     {
-      return (array[0] > 0) && (array[1] > 0) && (array[2] > 0) && (array[3] > 0);
+      bool is_same = true;
+      for (auto i = 0; i < n_dim; i++)
+        if (key[i] != other.key[i]) is_same = false;
+      return is_same;
     }
   };
diff --git a/include/comm_quda.h b/include/comm_quda.h
index 7d3bdfde19..39563c84f9 100644
--- a/include/comm_quda.h
+++ b/include/comm_quda.h
@@ -50,12 +50,26 @@ namespace quda
   int comm_dim(int dim);
 
   /**
-     Return the coording of this process in the dimension dim
+     Return the global number of processes in the dimension dim
+     @param dim Dimension which we are querying
+     @return Number of processes in this dimension
+  */
+  int comm_dim_global(int dim);
+
+  /**
+     Return the coordinate of this process in the dimension dim
     @param dim Dimension which we are querying
     @return Coordinate of this process
   */
   int comm_coord(int dim);
 
+  /**
+     Return the global coordinate of this process in the dimension dim
+     @param dim Dimension which we are querying
+     @return Coordinate of this process
+  */
+  int comm_coord_global(int dim);
+
   /**
    * Declare a message handle for sending `nbytes` to the `rank` with `tag`.
   */
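A quick note on the variadic `Spin_` overload above, since the bitwise AND can look odd at first glance: every pairwise call either aborts via `errorQuda` on a mismatch or returns the one common `Nspin()`, so ANDing the identical return values reproduces that value. A hypothetical three-field check expands as:

    // checkSpin(a, b, c) ==> Spin_(__func__, __FILE__, __LINE__, a, b, c)
    //                    ==> Spin_(a, b) & Spin_(a, c)  // func/file/line elided
    // Both calls return a.Nspin() (or error out), and n & n == n.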
diff --git a/include/dslash_quda.h b/include/dslash_quda.h
index 38efe07e6c..32111e92f9 100644
--- a/include/dslash_quda.h
+++ b/include/dslash_quda.h
@@ -773,9 +773,12 @@ namespace quda
     @param[in] a Scale factor applied to derivative
     @param[in] b Scale factor applied to aux field
     @param[in] x Vector field we accumulate onto to
+    @param[in] parity Destination parity
+    @param[in] comm_override Override for which dimensions are partitioned
+    @param[in] profile The TimeProfile used for profiling the dslash
   */
   void ApplyLaplace(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U,
-                    int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
+                    int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, const int *comm_override,
                     TimeProfile &profile);
 
   /**
diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h
index ee01a51b32..4fcea4b3d3 100644
--- a/include/eigensolve_quda.h
+++ b/include/eigensolve_quda.h
@@ -148,7 +148,7 @@ namespace quda
       @param[in] out Output spinor
       @param[in] in Input spinor
    */
-    double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in);
+    virtual double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in);
 
     /**
        @brief Orthogonalise input vectors r against
@@ -263,22 +263,10 @@ namespace quda
       @brief Compute eigenvalues and their residiua
       @param[in] mat Matrix operator
       @param[in] evecs The eigenvectors
-      @param[in] evals The eigenvalues
+      @param[in,out] evals The eigenvalues
       @param[in] size The number of eigenvalues to compute
    */
-    void computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size);
-
-    /**
-       @brief Compute eigenvalues and their residiua. This variant compute the number of converged eigenvalues.
-       @param[in] mat Matrix operator
-       @param[in] evecs The eigenvectors
-       @param[in] evals The eigenvalues
-    */
-    void computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals)
-    {
-      computeEvals(evecs, evals, n_conv);
-    }
+    virtual void computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size = 0);
 
     /**
        @brief Load and check eigenpairs from file
@@ -461,6 +449,116 @@ namespace quda
     void computeBlockKeptRitz(std::vector<ColorSpinorField> &kSpace);
   };
 
+  /**
+     @brief Thick Restarted Lanczos Method for 3D slices
+  */
+  class TRLM3D : public EigenSolver
+  {
+    bool verbose_rank = false; /** Whether this rank is one that logs */
+
+  public:
+    /**
+       @brief Constructor for Thick Restarted Eigensolver class
+       @param eig_param The eigensolver parameters
+       @param mat The operator to solve
+    */
+    TRLM3D(const DiracMatrix &mat, QudaEigParam *eig_param);
+
+    /**
+       @return Whether the solver is only for Hermitian systems
+    */
+    virtual bool hermitian() override { return true; } /** TRLM3D is only for Hermitian systems */
+
+    // Variable size matrix (for the 3D problem)
+    std::vector<std::vector<double>> ritz_mat_3D;
+
+    // Arrays for 3D residua
+    std::vector<std::vector<double>> residua_3D;
+
+    // Arrays for convergence and bookkeeping, one entry per slice
+    std::vector<bool> converged_3D;
+    std::vector<bool> active_3D;
+    std::vector<int> iter_locked_3D;
+    std::vector<int> iter_keep_3D;
+    std::vector<int> iter_converged_3D;
+    std::vector<int> num_locked_3D;
+    std::vector<int> num_keep_3D;
+    std::vector<int> num_converged_3D;
+
+    // Tridiagonal/Arrow matrices, fixed size (for the 3D problem)
+    std::vector<std::vector<double>> alpha_3D;
+    std::vector<std::vector<double>> beta_3D;
+
+    // The orthogonal direction and size in the 3D problem
+    int ortho_dim;
+    int ortho_dim_size;
+
+    /**
+       @brief Compute eigenpairs
+       @param[in] kSpace Krylov vector space
+       @param[in,out] evals Computed eigenvalues
+    */
+    void operator()(std::vector<ColorSpinorField> &kSpace, std::vector<Complex> &evals) override;
+
+    /**
+       @brief Lanczos step: extends the Krylov space.
+       @param[in] v Vector space
+       @param[in] j Index of vector being computed
+    */
+    void lanczosStep3D(std::vector<ColorSpinorField> &v, int j);
+
+    /**
+       @brief Reorder the Krylov space by eigenvalue
+       @param[in] kSpace the Krylov space
+    */
+    void reorder3D(std::vector<ColorSpinorField> &kSpace);
+
+    /**
+       @brief Get the eigendecomposition from the arrow matrix
+    */
+    void eigensolveFromArrowMat3D();
+
+    /**
+       @brief Rotate the Ritz vectors using the arrow matrix eigendecomposition
+       @param[in] kSpace The current Krylov space
+    */
+    void computeKeptRitz3D(std::vector<ColorSpinorField> &kSpace);
+
+    /**
+       @brief Orthogonalise input vectors r against
+       vector space v using block-BLAS
+       @param[in] v Vector space
+       @param[in] r Vectors to be orthogonalised
+       @param[in] j Use vectors v[0:j]
+    */
+    void blockOrthogonalize3D(std::vector<ColorSpinorField> &v, std::vector<ColorSpinorField> &r, int j);
+
+    /**
+       @brief Check for an initial guess. If none present, populate with random vectors, then
+       orthonormalise
+       @param[in] kSpace The Krylov space vectors
+    */
+    void prepareInitialGuess3D(std::vector<ColorSpinorField> &kSpace, int ortho_dim_size);
+
+    /**
+       @brief Estimate the spectral radius of the operator for the max value of the
+       Chebyshev polynomial
+       @param[in] out Output spinor
+       @param[in] in Input spinor
+    */
+    double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) override;
+
+    /**
+       @brief Compute eigenvalues and their residua
+       @param[in] evecs The eigenvectors
+       @param[in,out] evals The eigenvalues
+       @param[in] size The number of eigenvalues to compute
+    */
+    void computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size = 0) override;
+  };
+
   /**
      @brief Implicitly Restarted Arnoldi Method.
   */
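Since the new solver is driven entirely through QudaEigParam, a minimal configuration sketch may help. This is illustrative only: `t_local` is a placeholder for the local temporal extent, and the usual eigensolver parameters (n_ev, n_kr, n_conv, tol, ...) are set as for the ordinary TRLM, noting the constructor's requirement that n_kr >= n_ev + 6.

    QudaEigParam eig_param = newQudaEigParam();
    eig_param.eig_type = QUDA_EIG_TR_LANCZOS_3D; // enum value added below
    eig_param.spectrum = QUDA_SPECTRUM_SR_EIG;   // TRLM3D accepts only real spectra (LR or SR)
    eig_param.ortho_dim = 3;                     // only t-slicing is currently supported
    eig_param.ortho_dim_size_local = t_local;    // local temporal extent (placeholder)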
diff --git a/include/enum_quda.h b/include/enum_quda.h
index f281d33feb..462ce98cf0 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -138,6 +138,7 @@ typedef enum QudaInverterType_s {
 typedef enum QudaEigType_s {
   QUDA_EIG_TR_LANCZOS,     // Thick restarted lanczos solver
   QUDA_EIG_BLK_TR_LANCZOS, // Block Thick restarted lanczos solver
+  QUDA_EIG_TR_LANCZOS_3D,  // Thick restarted lanczos solver for 3-d systems
   QUDA_EIG_IR_ARNOLDI,     // Implicitly Restarted Arnoldi solver
   QUDA_EIG_BLK_IR_ARNOLDI, // Block Implicitly Restarted Arnoldi solver
   QUDA_EIG_INVALID = QUDA_INVALID_ENUM
@@ -277,8 +278,6 @@ typedef enum QudaVerbosity_s {
   QUDA_INVALID_VERBOSITY = QUDA_INVALID_ENUM
 } QudaVerbosity;
 
-typedef enum QudaTune_s { QUDA_TUNE_NO, QUDA_TUNE_YES, QUDA_TUNE_INVALID = QUDA_INVALID_ENUM } QudaTune;
-
 typedef enum QudaPreserveDirac_s {
   QUDA_PRESERVE_DIRAC_NO,
   QUDA_PRESERVE_DIRAC_YES,
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index 596232a7b3..8874959d7b 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -122,8 +122,9 @@
 #define QudaEigType integer(4)
 #define QUDA_EIG_TR_LANCZOS 0     // Thick Restarted Lanczos Solver
 #define QUDA_EIG_BLK_IR_LANCZOS 1 // Block Thick Restarted Lanczos Solver
-#define QUDA_EIG_IR_ARNOLDI 2 // Implicitly restarted Arnoldi solver
-#define QUDA_EIG_BLK_IR_ARNOLDI 3 // Block Implicitly restarted Arnoldi solver (not yet implemented)
+#define QUDA_EIG_TR_LANCZOS_3D 2 // Thick Restarted Lanczos Solver for 3-d systems
+#define QUDA_EIG_IR_ARNOLDI 3 // Implicitly restarted Arnoldi solver
+#define QUDA_EIG_BLK_IR_ARNOLDI 4 // Block Implicitly restarted Arnoldi solver (not yet implemented)
 #define QUDA_EIG_INVALID QUDA_INVALID_ENUM
 
 #define QudaEigSpectrumType integer(4)
@@ -247,11 +248,6 @@
 #define QUDA_DEBUG_VERBOSE 3
 #define QUDA_INVALID_VERBOSITY
QUDA_INVALID_ENUM -#define QudaTune integer(4) -#define QUDA_TUNE_NO 0 -#define QUDA_TUNE_YES 1 -#define QUDA_TUNE_INVALID QUDA_INVALID_ENUM - #define QudaPreserveDirac integer(4) #define QUDA_PRESERVE_DIRAC_NO 0 #define QUDA_PRESERVE_DIRAC_YES 1 diff --git a/include/index_helper.cuh b/include/index_helper.cuh index 990ff949e7..cf1faf72b0 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1106,4 +1106,26 @@ namespace quda { return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1; } + /** + @brief Compute the flattened 4-d index from a separate T and + flattened 3-d XYZ index. + @param[in] t Temporal index + @param[in] xyz Flattened 3-d index + @param[in] X Lattice dimensions + @return The flattened 4-d index + */ + template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) + { + int x[4]; +#pragma unroll + for (int d = 0; d < 4; d++) { + if (d != reduction_dim) { + x[d] = xyz % X[d]; + xyz = xyz / X[d]; + } + } + x[reduction_dim] = t; + return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); + } + } // namespace quda diff --git a/include/kernels/blas_3d.cuh b/include/kernels/blas_3d.cuh new file mode 100644 index 0000000000..e7426e89af --- /dev/null +++ b/include/kernels/blas_3d.cuh @@ -0,0 +1,331 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + struct baseArg : kernel_param<> { + int_fastdiv X[4]; // grid dimensions + + baseArg(dim3 threads, const ColorSpinorField &x) : kernel_param(threads) + { + for (int i = 0; i < 4; i++) { X[i] = x.X()[i]; } + if (x.SiteSubset() == 1) X[0] = 2 * X[0]; + } + }; + + template struct copy3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F y; + F x; + const int slice; + + copy3dArg(ColorSpinorField &y, ColorSpinorField &x, int slice) : + baseArg(dim3(y.VolumeCB(), y.SiteSubset(), 1), y), y(y), x(x), slice(slice) + { + } + }; + + template struct copyTo3d { + const Arg &arg; + constexpr copyTo3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + // Isolate the slice of the 4D array + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + + // Get 4D data + Vector y = arg.y(x_cb, parity); + // Get 3D location + int xyz = ((idx[2] * arg.X[1] + idx[1]) * arg.X[0] + idx[0]); + + // Write to 3D + arg.x(xyz / 2, xyz % 2) = y; + } + } + }; + + template struct copyFrom3d { + const Arg &arg; + constexpr copyFrom3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + int idx[4] = {}; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + + // Get 3D location + int xyz = ((idx[2] * arg.X[1] + idx[1]) * arg.X[0] + idx[0]); + Vector x = arg.x(xyz / 2, xyz % 2); + // Write to 4D + arg.y(x_cb, parity) = x; + } + } + }; + + template struct swap3d { + const Arg &arg; + constexpr swap3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void 
operator()(int x_cb, int parity) + { + int idx[4] = {}; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + arg.x(x_cb, parity) = y; + arg.y(x_cb, parity) = x; + } + } + }; + + template struct axpby3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + static constexpr bool disable_ghost = true; + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + static constexpr int MAX_ORTHO_DIM = 256; + real a[MAX_ORTHO_DIM]; + const F x; + real b[MAX_ORTHO_DIM]; + F y; + + axpby3dArg(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) : + baseArg(dim3(x.VolumeCB(), x.SiteSubset(), 1), x), x(x), y(y) + { + if (x.X(3) > MAX_ORTHO_DIM) errorQuda("Orthogonal dimension %d exceeds maximum %d", x.X(3), MAX_ORTHO_DIM); + for (auto i = 0u; i < a.size(); i++) this->a[i] = a[i]; + for (auto i = 0u; i < b.size(); i++) this->b[i] = b[i]; + } + }; + + template struct axpby3d { + const Arg &arg; + constexpr axpby3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + + // Get the coeffs for this dim slice + real a = arg.a[idx[3]]; + real b = arg.b[idx[3]]; + + // Compute the axpby + Vector out = a * x + b * y; + + // Write to y + arg.y(x_cb, parity) = out; + } + }; + + template struct caxpby3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + static constexpr bool disable_ghost = true; + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + static constexpr int MAX_ORTHO_DIM = 256; + complex a[MAX_ORTHO_DIM]; + const F x; + complex b[MAX_ORTHO_DIM]; + F y; + + caxpby3dArg(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) : + baseArg(dim3(x.VolumeCB(), x.SiteSubset(), 1), x), x(x), y(y) + { + if (x.X(3) > MAX_ORTHO_DIM) errorQuda("Orthogonal dimension %d exceeds maximum %d", x.X(3), MAX_ORTHO_DIM); + for (auto i = 0u; i < a.size(); i++) this->a[i] = a[i]; + for (auto i = 0u; i < b.size(); i++) this->b[i] = b[i]; + } + }; + + template struct caxpby3d { + const Arg &arg; + constexpr caxpby3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + + // Get the coeffs for this dim slice + complex a = arg.a[idx[3]]; + complex b = arg.b[idx[3]]; + + // Compute the caxpby + Vector out = a * x + b * y; + + // Write to y + arg.y(x_cb, parity) = out; + } + }; + + template struct reDotProduct3dArg : public ReduceArg { + using 
real = typename mapper::type; + static constexpr int reduction_dim = 3; // DMH Template this + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F x; + F y; + int_fastdiv Xh[4]; // checkerboard grid dimensions + + reDotProduct3dArg(const ColorSpinorField &x, const ColorSpinorField &y) : + ReduceArg(dim3(x.VolumeCB() / x.X()[reduction_dim], x.SiteSubset(), x.X()[reduction_dim]), + x.X()[reduction_dim]), + x(x), + y(y) + { + for (int i = 0; i < 4; i++) Xh[i] = x.SiteSubset() == 2 && i == 0 ? x.X()[i] / 2 : x.X()[i]; + } + + __device__ __host__ double init() const { return double(); } + }; + + template struct reDotProduct3d : plus { + using reduce_t = double; + using plus::operator(); + static constexpr int reduce_block_dim = 2; + const Arg &arg; + constexpr reDotProduct3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int parity, int t) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + // Collect vector data + int idx_cb = idx_from_t_xyz<3>(t, xyz, arg.Xh); + + Vector x = arg.x(idx_cb, parity); + Vector y = arg.y(idx_cb, parity); + + // Get the inner product + reduce_t sum = innerProduct(x, y).real(); + + // Apply reduction to t bucket + return plus::operator()(sum, result); + } + }; + + template struct cDotProduct3dArg : public ReduceArg> { + using real = typename mapper::type; + static constexpr int reduction_dim = 3; // DMH Template this + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F x; + F y; + int_fastdiv X[4]; // grid dimensions + int_fastdiv Xh[4]; // checkerboard grid dimensions + + cDotProduct3dArg(const ColorSpinorField &x, const ColorSpinorField &y) : + ReduceArg>(dim3(x.VolumeCB() / x.X()[reduction_dim], x.SiteSubset(), x.X()[reduction_dim]), + x.X()[reduction_dim]), + x(x), + y(y) + { + for (int i = 0; i < 4; i++) Xh[i] = x.SiteSubset() == 2 && i == 0 ? 
x.X()[i] / 2 : x.X()[i]; + } + + __device__ __host__ reduce_t init() const { return {0.0, 0.0}; } + }; + + template struct cDotProduct3d : plus> { + using reduce_t = array; + using plus::operator(); + static constexpr int reduce_block_dim = 2; + + const Arg &arg; + constexpr cDotProduct3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int parity, int t) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + // Collect vector data + int idx_cb = idx_from_t_xyz<3>(t, xyz, arg.Xh); + + Vector x = arg.x(idx_cb, parity); + Vector y = arg.y(idx_cb, parity); + + // Get the inner product + complex res = innerProduct(x, y); + + // Apply reduction to temporal bucket + return plus::operator()({res.real(), res.imag()}, result); + } + }; +} // namespace quda diff --git a/include/kernels/contraction.cuh b/include/kernels/contraction.cuh index 96079ed041..815da471d2 100644 --- a/include/kernels/contraction.cuh +++ b/include/kernels/contraction.cuh @@ -32,20 +32,6 @@ namespace quda return ((sink[3] * X[2] + sink[2]) * X[1] + sink[1]) * X[0] + sink[0]; } - template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) - { - int x[4]; -#pragma unroll - for (int d = 0; d < 4; d++) { - if (d != reduction_dim) { - x[d] = xyz % X[d]; - xyz /= X[d]; - } - } - x[reduction_dim] = t; - return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); - } - template struct ContractionSummedArg : public ReduceArg { using reduce_t = contract_t; diff --git a/include/kernels/copy_field_offset.cuh b/include/kernels/copy_field_offset.cuh index ea4a603bfd..5f614b2e49 100644 --- a/include/kernels/copy_field_offset.cuh +++ b/include/kernels/copy_field_offset.cuh @@ -142,12 +142,12 @@ namespace quda if (arg.mode == QudaOffsetCopyMode::COLLECT) { // we are collecting so x_cb is the index for the input. idx_in = x_cb; - getCoordsExtended(coordinate, x_cb, arg.dim_in, parity, arg.offset.array); + getCoordsExtended(coordinate, x_cb, arg.dim_in, parity, arg.offset.data()); idx_out = linkIndex(coordinate, arg.dim_out); } else { // we are dispersing so x_cb is the index for the output. idx_out = x_cb; - getCoordsExtended(coordinate, x_cb, arg.dim_out, parity, arg.offset.array); + getCoordsExtended(coordinate, x_cb, arg.dim_out, parity, arg.offset.data()); idx_in = linkIndex(coordinate, arg.dim_in); } copy_field(s * arg.volume_4d_cb_out + idx_out, s * arg.volume_4d_cb_in + idx_in, parity, arg); diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index a68187b1d4..a756fe43c3 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ b/include/kernels/dslash_domain_wall_m5.cuh @@ -578,7 +578,8 @@ namespace quda for (int s_count = 0; s_count < arg.Ls; s_count++) { auto factorR = (s_ < s ? -arg.m_f * R : R); - Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); r += factorR * in.project(4, proj_dir); R *= coeff.kappa(s); @@ -597,7 +598,8 @@ namespace quda for (int s_count = 0; s_count < arg.Ls; s_count++) { auto factorL = (s_ > s ? -arg.m_f * L : L); - Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? 
cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); l += factorL * in.project(4, proj_dir); L *= coeff.kappa(s); diff --git a/include/kernels/evec_project.cuh b/include/kernels/evec_project.cuh index 7bea55d908..add4b7caba 100644 --- a/include/kernels/evec_project.cuh +++ b/include/kernels/evec_project.cuh @@ -47,25 +47,11 @@ namespace quda { __device__ __host__ spinor_array init() const { return spinor_array(); } }; - template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) - { - int x[4]; -#pragma unroll - for (int d = 0; d < 4; d++) { - if (d != reduction_dim) { - x[d] = xyz % X[d]; - xyz = xyz / X[d]; - } - } - x[reduction_dim] = t; - return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); - } - template struct EvecProjection : plus { using reduce_t = spinor_array; - using plus::operator(); - static constexpr int reduce_block_dim = 1; // only doing a reduct in the x thread dimension - + using plus::operator(); + static constexpr int reduce_block_dim = 1; // only doing a reduce in the x thread dimension + const Arg &arg; constexpr EvecProjection(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index 152dbbce2f..03659a722f 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -44,8 +44,8 @@ namespace quda LaplaceArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, int dir, double a, double b, - cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override), + cvector_ref &x, int parity, const int *comm_override) : + DslashArg(out, in, halo, U, x, parity, false, a != 0.0 ? true : false, 1, false, comm_override), halo_pack(halo), halo(halo), U(U), @@ -74,7 +74,7 @@ namespace quda @param[in] thread_dim Which dimension this thread corresponds to (fused exterior only) */ - template + template __device__ __host__ inline void applyLaplace(Vector &out, Arg &arg, Coord &coord, int parity, int, int thread_dim, bool &active, int src_idx) { @@ -136,8 +136,7 @@ namespace quda } // out(x) = M*in - template struct laplace : - dslash_default { + template struct laplace : dslash_default { const Arg &arg; template constexpr laplace(const Ftor &ftor) : arg(ftor.arg) {} @@ -164,12 +163,10 @@ namespace quda // case 4 is an operator in all x,y,z,t dimensions // case 3 is a spatial operator only, the t dimension is omitted. switch (arg.dir) { - case 3: - applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); - break; + case 3: applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); break; case 4: default: - applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); + applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); break; } diff --git a/include/lattice_field.h b/include/lattice_field.h index 462add3bde..2cf1f6beaa 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -902,6 +902,40 @@ namespace quda { #define checkPrecision(...) Precision_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** + @brief Helper function for determining if the color of the fields is the same. 
+ @param[in] a Input field + @param[in] b Input field + @return If color is unique return the number of colors + */ + template + inline int Color_(const char *func, const char *file, int line, const T1 &a_, const T2 &b_) + { + const unwrap_t &a(a_); + const unwrap_t &b(b_); + int nColor = 0; + if (a.Ncolor() == b.Ncolor()) + nColor = a.Ncolor(); + else + errorQuda("Color %d %d do not match (%s:%d in %s())", a.Ncolor(), b.Ncolor(), file, line, func); + return nColor; + } + + /** + @brief Helper function for determining if the color of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @param[in] args List of additional fields to check color on + @return If colors is unique return the number of colors + */ + template + inline int Color_(const char *func, const char *file, int line, const T1 &a, const T2 &b, const Args &...args) + { + return Color_(func, file, line, a, b) & Color_(func, file, line, a, args...); + } + +#define checkColor(...) Color_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** @brief Helper function for determining if the field is in native order @param[in] a Input field diff --git a/include/quda.h b/include/quda.h index 155791b60a..da78a50a3b 100644 --- a/include/quda.h +++ b/include/quda.h @@ -9,6 +9,7 @@ * as the Fortran interface in lib/quda_fortran.F90. */ +#include // bool support #include #include /* for FILE */ #include @@ -283,8 +284,6 @@ extern "C" { double temp; /**< The mean temperature of the device for the duration of the solve */ double clock; /**< The mean clock frequency of the device for the duration of the solve */ - QudaTune tune; /**< Enable auto-tuning? (default = QUDA_TUNE_YES) */ - /** Number of steps in s-step algorithms */ int Nsteps; @@ -496,7 +495,7 @@ extern "C" { QudaBoolean preserve_deflation; /** This is where we store the deflation space. This will point - to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ + to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ void *preserve_deflation_space; /** If we restore the deflation space, this boolean indicates @@ -506,6 +505,10 @@ extern "C" { false, but preserve_deflation would be true */ QudaBoolean preserve_evals; + /** Whether to use the smeared gauge field for the Dirac operator + for whose eigenvalues are are computing. */ + bool use_smeared_gauge; + /** What type of Dirac operator we are using **/ /** If !(use_norm_op) && !(use_dagger) use M. **/ /** If use_dagger, use Mdag **/ @@ -569,6 +572,12 @@ extern "C" { /** Name of the QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ char QUDA_logfile[512]; + /** The orthogonal direction in the 3D eigensolver **/ + int ortho_dim; + + /** The size of the orthogonal direction in the 3D eigensolver, local **/ + int ortho_dim_size_local; + //------------------------------------------------- // EIG-CG PARAMS @@ -1485,7 +1494,7 @@ extern "C" { * @param inGauge Pointer to the device gauge field (QUDA device field) * @param param The parameters of the host and device fields */ - void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); + void saveGaugeFieldQuda(void *outGauge, void *inGauge, QudaGaugeParam *param); /** * Reinterpret gauge as a pointer to a GaugeField and call destructor. 
diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index 0614b3bda3..497e2b2b59 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include using count_t = cuda::atomic; namespace quda diff --git a/include/tune_quda.h b/include/tune_quda.h index 03b9b148ee..b49ccc9753 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -43,6 +43,14 @@ namespace quda { */ const std::map &getTuneCache(); + /** + @brief Unify all instances of the tunecache across ranks. This + is called after returning to a global communicator. + @param[in] rank_list The list of ranks from whose tunecaches we + want to merge to form the global tunecache + */ + void joinTuneCache(const std::vector &rank_list); + /** @brief Return a string encoding the QUDA version */ @@ -61,7 +69,7 @@ namespace quda { class Tunable { - friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity); + friend TuneParam tuneLaunch(Tunable &, bool, QudaVerbosity); static inline uint64_t _flops_global = 0; static inline uint64_t _bytes_global = 0; @@ -417,7 +425,7 @@ namespace quda { * @param[in] verbosity What verbosity to use during tuning? * @return The tuned launch parameters */ - TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled = getTuning(), QudaVerbosity verbosity = getVerbosity()); + TuneParam tuneLaunch(Tunable &tunable, bool enabled = getTuning(), QudaVerbosity verbosity = getVerbosity()); /** * @brief Post an event in the trace, recording where it was posted diff --git a/include/util_quda.h b/include/util_quda.h index 3d68fb5a2e..031f878762 100644 --- a/include/util_quda.h +++ b/include/util_quda.h @@ -20,7 +20,25 @@ namespace quda @brief Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_ENABLE_TUNING=0. @return If autotuning is enabled */ -QudaTune getTuning(); +bool getTuning(); + +/** + @brief Set the tuning state + @param[in] tuning New tuning state + */ +void setTuning(bool tuning); + +/** + @brief Push a new tuning state, and back up the present one on the + stack. + @param[in] tune New tuning state + */ +void pushTuning(bool tune); + +/** + @brief Pop the present tuning state and restore what is on the stack. 
+ */ +void popTuning(); QudaVerbosity getVerbosity(); char *getOutputPrefix(); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 950ac83a09..f9c2247b33 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -20,8 +20,9 @@ set (QUDA_OBJS solve.cpp monitor.cpp dirac_coarse.cpp dslash_coarse.cpp coarse_op.cpp coarsecoarse_op.cpp coarse_op_preconditioned.cpp staggered_coarse_op.cpp - eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp vector_io.cpp - eigensolve_quda.cpp quda_arpack_interface.cpp + eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp + eig_trlm_3d.cpp blas_3d.cu + vector_io.cpp eigensolve_quda.cpp quda_arpack_interface.cpp multigrid.cpp transfer.cpp block_orthogonalize.cpp prolongator.cpp restrictor.cpp staggered_prolong_restrict.cu gauge_phase.cu timer.cpp diff --git a/lib/blas_3d.cu b/lib/blas_3d.cu new file mode 100644 index 0000000000..acab3da85e --- /dev/null +++ b/lib/blas_3d.cu @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + namespace blas3d + { + + template class copy3D : TunableKernel2D + { + ColorSpinorField &y; + ColorSpinorField &x; + const int t_slice; + const copyType type; + unsigned int minThreads() const { return y.VolumeCB(); } + + public: + copy3D(ColorSpinorField &y, ColorSpinorField &x, int t_slice, copyType type) : + TunableKernel2D(y, y.SiteSubset()), y(y), x(x), t_slice(t_slice), type(type) + { + // Check slice value + if (t_slice < 0 || t_slice >= y.X()[3]) errorQuda("Unexpected slice %d", t_slice); + + strcat(aux, type == copyType::SWAP_3D ? ",swap_3d" : type == copyType::COPY_TO_3D ? ",to_3d" : ",from_3d"); + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + copy3dArg arg(y, x, t_slice); + switch (type) { + case copyType::COPY_TO_3D: launch(tp, stream, arg); break; + case copyType::COPY_FROM_3D: launch(tp, stream, arg); break; + case copyType::SWAP_3D: launch(tp, stream, arg); break; + default: errorQuda("Unknown 3D copy type"); + } + } + + void preTune() + { + x.backup(); + y.backup(); + } + void postTune() + { + x.restore(); + y.restore(); + } + long long bytes() const + { + return (type == copyType::SWAP_3D ? 
2 : 1) * (x.Bytes() / x.X(3) + y.Bytes() / y.X(3)); + } + }; + + void copy(const int slice, const copyType type, ColorSpinorField &x, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + // Check orth dim + if (x.X()[3] != 1) errorQuda("Unexpected dimensions in x[3]=%d", x.X()[3]); + // We must give a 4D Lattice field as the first argument + instantiate(y, x, slice, type); + } + + void swap(int slice, ColorSpinorField &x, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + instantiate(x, y, slice, copyType::SWAP_3D); + } + + template class axpby3D : TunableKernel2D + { + const ColorSpinorField &x; + ColorSpinorField &y; + const std::vector &a; + const std::vector &b; + unsigned int minThreads() const override { return x.VolumeCB(); } + + public: + axpby3D(const ColorSpinorField &x, ColorSpinorField &y, const std::vector &a, const std::vector &b) : + TunableKernel2D(x, x.SiteSubset()), x(x), y(y), a(a), b(b) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, axpby3dArg(a, x, b, y)); + } + + void preTune() override { y.backup(); } + void postTune() override { y.restore(); } + long long flops() const override { return 6 * x.Volume() * x.Nspin() * x.Ncolor(); } + long long bytes() const override { return x.Bytes() + 2 * y.Bytes(); } + }; + + void axpby(const std::vector &a, const ColorSpinorField &x, const std::vector &b, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (a.size() != b.size() && a.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array sizes a=%lu b=%lu, x[3]=%d", a.size(), b.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, a, b); + } + + void ax(const std::vector &a, ColorSpinorField &x) + { + std::vector zeros(a.size(), 0.0); + axpby(a, x, zeros, x); + } + + template class caxpby3D : TunableKernel2D + { + const ColorSpinorField &x; + ColorSpinorField &y; + const std::vector &a; + const std::vector &b; + unsigned int minThreads() const override { return x.VolumeCB(); } + + public: + caxpby3D(const ColorSpinorField &x, ColorSpinorField &y, const std::vector &a, + const std::vector &b) : + TunableKernel2D(x, x.SiteSubset()), x(x), y(y), a(a), b(b) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, caxpby3dArg(a, x, b, y)); + } + + void preTune() override { y.backup(); } + void postTune() override { y.restore(); } + long long flops() const override { return 14 * x.Volume() * x.Nspin() * x.Ncolor(); } + long long bytes() const override { return x.Bytes() + 2 * y.Bytes(); } + }; + + void caxpby(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (a.size() != b.size() && a.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array sizes a=%lu b=%lu, x[3]=%d", a.size(), b.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, a, b); + } + + template class reDotProduct3D : TunableMultiReduction + { + const ColorSpinorField &x; + const ColorSpinorField &y; + std::vector &result; + + public: + reDotProduct3D(const 
ColorSpinorField &x, const ColorSpinorField &y, std::vector &result) : + TunableMultiReduction(x, x.SiteSubset(), x.X()[3]), x(x), y(y), result(result) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + reDotProduct3dArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const { return x.Volume() * x.Nspin() * x.Ncolor() * 2; } + long long bytes() const { return x.Bytes() + y.Bytes(); } + }; + + void reDotProduct(std::vector &result, const ColorSpinorField &x, const ColorSpinorField &y) + { + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (result.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array size a=%lu, x[3]=%d", result.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, result); + } + + template class cDotProduct3D : TunableMultiReduction + { + const ColorSpinorField &x; + const ColorSpinorField &y; + std::vector &result; + + public: + cDotProduct3D(const ColorSpinorField &x, const ColorSpinorField &y, std::vector &result) : + TunableMultiReduction(x, x.SiteSubset(), x.X()[3]), x(x), y(y), result(result) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + cDotProduct3dArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const { return x.Volume() * x.Nspin() * x.Ncolor() * 8; } + long long bytes() const { return x.Bytes() + y.Bytes(); } + }; + + void cDotProduct(std::vector &result, const ColorSpinorField &x, const ColorSpinorField &y) + { + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (result.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array size a=%lu, x[3]=%d", result.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, result); + } + + } // namespace blas3d + +} // namespace quda diff --git a/lib/check_params.h b/lib/check_params.h index 639db3eb3d..cdbe36169b 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -175,6 +175,7 @@ void printQudaEigParam(QudaEigParam *param) { P(preserve_deflation, QUDA_BOOLEAN_FALSE); P(preserve_deflation_space, 0); P(preserve_evals, QUDA_BOOLEAN_TRUE); + P(use_smeared_gauge, false); P(use_dagger, QUDA_BOOLEAN_FALSE); P(use_norm_op, QUDA_BOOLEAN_FALSE); P(compute_svd, QUDA_BOOLEAN_FALSE); diff --git a/lib/communicator_mpi.cpp b/lib/communicator_mpi.cpp index cc244b43b8..e0e652c26c 100644 --- a/lib/communicator_mpi.cpp +++ b/lib/communicator_mpi.cpp @@ -106,6 +106,7 @@ namespace quda } comm_init(nDim, commDims, rank_from_coords, map_data); + globalReduce.push(true); } @@ -174,6 +175,9 @@ namespace quda grid_size, size); } + // defer handling MPI errors to QUDA + MPI_Comm_set_errhandler(MPI_COMM_HANDLE, MPI_ERRORS_RETURN); + comm_init_common(ndim, dims, rank_from_coords, map_data); } diff --git a/lib/communicator_qmp.cpp b/lib/communicator_qmp.cpp index 1a4a2a7878..67d4a8694f 100644 --- a/lib/communicator_qmp.cpp +++ b/lib/communicator_qmp.cpp @@ -166,6 +166,9 @@ void Communicator::comm_init(int ndim, const int *dims, QudaCommsMap rank_from_c grid_size, QMP_comm_get_number_of_nodes(QMP_COMM_HANDLE)); } + // defer handling MPI errors to QMP + MPI_Comm_set_errhandler(MPI_COMM_HANDLE, MPI_ERRORS_RETURN); + comm_init_common(ndim, dims, rank_from_coords, map_data); } diff --git a/lib/communicator_stack.cpp 
b/lib/communicator_stack.cpp index 4abf6e38a2..cc36909d51 100644 --- a/lib/communicator_stack.cpp +++ b/lib/communicator_stack.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace quda { @@ -49,8 +50,16 @@ namespace quda void push_communicator(const CommKey &split_key) { if (comm_nvshmem_enabled()) - errorQuda( - "Split-grid is currently not supported with NVSHMEM. Please set QUDA_ENABLE_NVSHMEM=0 to disable NVSHMEM."); + errorQuda("Split-grid is currently not supported with NVSHMEM. Set QUDA_ENABLE_NVSHMEM=0 to disable NVSHMEM."); + + // if reverting to global, we will need to join the tunecaches + bool join_tune_cache = split_key == default_comm_key; + int local_rank = comm_rank(); + int local_tune_rank = 0; + + // used to store the size of the tunecache at the point of splitting + static size_t tune_cache_size = 0; + auto search = communicator_stack.find(split_key); if (search == communicator_stack.end()) { communicator_stack.emplace(std::piecewise_construct, std::forward_as_tuple(split_key), @@ -59,7 +68,38 @@ namespace quda LatticeField::freeGhostBuffer(); // Destroy the (IPC) Comm buffers with the old communicator. + auto split_key_old = current_key; current_key = split_key; + + // we are returning to global so we need to join any diverged tunecaches + if (join_tune_cache) { + // has this tunecache been updated? + int tune_cache_update = tune_cache_size != getTuneCache().size(); + // have any of tunecaches across the split grid been updated? + comm_allreduce_int(tune_cache_update); + + if (tune_cache_update) { + auto num_sub_partition = split_key_old.product(); // the number of caches we need to merge + int sub_partition_dims[] = {comm_dim(0) / split_key_old[0], comm_dim(1) / split_key_old[1], + comm_dim(2) / split_key_old[2], comm_dim(3) / split_key_old[3]}; + int sub_partition_coords[] = {comm_coord(0) / sub_partition_dims[0], comm_coord(1) / sub_partition_dims[1], + comm_coord(2) / sub_partition_dims[2], comm_coord(3) / sub_partition_dims[3]}; + auto sub_partition_idx = sub_partition_coords[split_key_old.n_dim - 1]; + for (auto d = split_key_old.n_dim - 2; d >= 0; d--) + sub_partition_idx = sub_partition_idx * split_key_old[d] + sub_partition_coords[d]; + + std::vector global_tune_ranks(num_sub_partition); + for (auto i = 0; i < num_sub_partition; i++) { + global_tune_ranks[i] = (sub_partition_idx == i && local_tune_rank == local_rank) ? 
comm_rank() : 0; + comm_allreduce_int(global_tune_ranks[i]); + } + // we now have a list of all the global tune ranks so we can join them + joinTuneCache(global_tune_ranks); + } + } else if (!join_tune_cache) { + // record size of tunecache when first splitting the grid + tune_cache_size = getTuneCache().size(); + } } #if defined(QMP_COMMS) || defined(MPI_COMMS) @@ -70,8 +110,12 @@ namespace quda int comm_dim(int dim) { return get_current_communicator().comm_dim(dim); } + int comm_dim_global(int dim) { return get_default_communicator().comm_dim(dim); } + int comm_coord(int dim) { return get_current_communicator().comm_coord(dim); } + int comm_coord_global(int dim) { return get_default_communicator().comm_coord(dim); } + int comm_rank_from_coords(const int *coords) { return get_current_communicator().comm_rank_from_coords(coords); } void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data, bool user_set_comm_handle, diff --git a/lib/eig_trlm_3d.cpp b/lib/eig_trlm_3d.cpp new file mode 100644 index 0000000000..7480a58395 --- /dev/null +++ b/lib/eig_trlm_3d.cpp @@ -0,0 +1,670 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + // Thick Restarted Lanczos Method constructor + TRLM3D::TRLM3D(const DiracMatrix &mat, QudaEigParam *eig_param) : EigenSolver(mat, eig_param) + { + getProfile().TPSTART(QUDA_PROFILE_INIT); + + ortho_dim = eig_param->ortho_dim; + ortho_dim_size = eig_param->ortho_dim_size_local; + if (ortho_dim != 3) + errorQuda("Only 3D spatial splitting (ortho_dim = 3) is supported, ortho_dim passed = %d", ortho_dim); + verbose_rank = comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0; + + // Tridiagonal/Arrow matrices + alpha_3D.resize(ortho_dim_size); + beta_3D.resize(ortho_dim_size); + + residua_3D.resize(ortho_dim_size); + ritz_mat_3D.resize(ortho_dim_size); + converged_3D.resize(ortho_dim_size, false); + active_3D.resize(ortho_dim_size, false); + + iter_locked_3D.resize(ortho_dim_size, 0); + iter_keep_3D.resize(ortho_dim_size, 0); + iter_converged_3D.resize(ortho_dim_size, 0); + + num_locked_3D.resize(ortho_dim_size, 0); + num_keep_3D.resize(ortho_dim_size, 0); + num_converged_3D.resize(ortho_dim_size, 0); + + for (int i = 0; i < ortho_dim_size; i++) { + alpha_3D[i].resize(n_kr, 0.0); + beta_3D[i].resize(n_kr, 0.0); + residua_3D[i].resize(n_kr, 0.0); + } + + // 3D thick restart specific checks + if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than n_ev+6=%d\n", n_kr, n_ev + 6); + + if (!(eig_param->spectrum == QUDA_SPECTRUM_LR_EIG || eig_param->spectrum == QUDA_SPECTRUM_SR_EIG)) { + errorQuda("Only real spectrum type (LR or SR) can be passed to the TR Lanczos solver"); + } + + getProfile().TPSTOP(QUDA_PROFILE_INIT); + } + + void TRLM3D::operator()(std::vector &kSpace, std::vector &evals) + { + // Create 3-d split communicators + CommKey split_key = {1, 1, 1, comm_dim(3)}; + + if (!split_key.is_valid()) { + errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]); + } + logQuda(QUDA_DEBUG_VERBOSE, "Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n", + comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]); + push_communicator(split_key); + + // Override any user input for block size. 
+ block_size = 1; + + // Pre-launch checks and preparation + //--------------------------------------------------------------------------- + queryPrec(kSpace[0].Precision()); + // Check to see if we are loading eigenvectors + if (strcmp(eig_param->vec_infile, "") != 0) { + printfQuda("Loading evecs from file name %s\n", eig_param->vec_infile); + loadFromFile(kSpace, evals); + return; + } + + // Check for an initial guess. If none present, populate with rands, then + // orthonormalise + prepareInitialGuess3D(kSpace, ortho_dim_size); + + // Increase the size of kSpace passed to the function, will be trimmed to + // original size before exit. + prepareKrylovSpace(kSpace, evals); + + // Check for Chebyshev maximum estimation + checkChebyOpMax(kSpace); + + // Convergence and locking criteria + std::vector mat_norm_3D(ortho_dim_size, 0.0); + double epsilon = setEpsilon(kSpace[0].Precision()); + + // Print Eigensolver params + printEigensolverSetup(); + //--------------------------------------------------------------------------- + + // Begin TRLM Eigensolver computation + //--------------------------------------------------------------------------- + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + int t_offset = ortho_dim_size * comm_coord(3); + + // Loop over restart iterations. + while (restart_iter < max_restarts && !converged) { + + // Get min step + int step_min = *std::min_element(num_locked_3D.begin(), num_locked_3D.end()); + comm_allreduce_min(step_min); + + for (int step = step_min; step < n_kr; step++) lanczosStep3D(kSpace, step); + iter += (n_kr - step_min); + + // The eigenvalues are returned in the alpha array + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + eigensolveFromArrowMat3D(); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + // mat_norm is updated and used for LR + for (int t = 0; t < ortho_dim_size; t++) + for (int i = num_locked_3D[t]; i < n_kr; i++) + if (fabs(alpha_3D[t][i]) > mat_norm_3D[t]) mat_norm_3D[t] = fabs(alpha_3D[t][i]); + + // Lambda that returns mat_norm for LR and returns the relevant alpha + // (the corresponding Ritz value) for SR + auto check_norm = [&](double sr_norm, int t) -> double { + if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG) + return mat_norm_3D[t]; + else + return sr_norm; + }; + + // Locking check + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + iter_locked_3D[t] = 0; + for (int i = 0; i < (n_kr - num_locked_3D[t]); i++) { + if (residua_3D[t][i + num_locked_3D[t]] < epsilon * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)) { + logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d %d resid=%+.6e condition=%.6e ****\n", t, i, + residua_3D[t][i + num_locked_3D[t]], epsilon * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)); + iter_locked_3D[t] = i + 1; + } else { + // Unlikely to find new locked pairs + break; + } + } + } + } + + // Convergence check + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + iter_converged_3D[t] = iter_locked_3D[t]; + for (int i = iter_locked_3D[t]; i < n_kr - num_locked_3D[t]; i++) { + if (residua_3D[t][i + num_locked_3D[t]] < tol * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)) { + logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d %d resid=%+.6e condition=%.6e ****\n", t, i, + residua_3D[t][i + num_locked_3D[t]], tol * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)); + iter_converged_3D[t] = i + 1; + } else { + // Unlikely to find new converged pairs + break; + } + } + } + } + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) + iter_keep_3D[t] + = 
std::min(iter_converged_3D[t] + (n_kr - num_converged_3D[t]) / 2, n_kr - num_locked_3D[t] - 12); + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + computeKeptRitz3D(kSpace); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + int min_nconv = n_kr + 1; + int min_nlock = n_kr + 1; + int min_nkeep = n_kr + 1; + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + num_converged_3D[t] = num_locked_3D[t] + iter_converged_3D[t]; + num_keep_3D[t] = num_locked_3D[t] + iter_keep_3D[t]; + num_locked_3D[t] += iter_locked_3D[t]; + + if (num_converged_3D[t] < min_nconv) min_nconv = num_converged_3D[t]; + if (num_locked_3D[t] < min_nlock) min_nlock = num_locked_3D[t]; + if (num_keep_3D[t] < min_nkeep) min_nkeep = num_keep_3D[t]; + + // Use printf to get data from t dim only + if (getVerbosity() >= QUDA_DEBUG_VERBOSE && verbose_rank) { + printf("%04d converged eigenvalues for timeslice %d at restart iter %04d\n", num_converged_3D[t], + t_offset + t, restart_iter + 1); + printf("iter Conv[%d] = %d\n", t_offset + t, iter_converged_3D[t]); + printf("iter Keep[%d] = %d\n", t_offset + t, iter_keep_3D[t]); + printf("iter Lock[%d] = %d\n", t_offset + t, iter_locked_3D[t]); + printf("num_converged[%d] = %d\n", t_offset + t, num_converged_3D[t]); + printf("num_keep[%d] = %d\n", t_offset + t, num_keep_3D[t]); + printf("num_locked[%d] = %d\n", t_offset + t, num_locked_3D[t]); + for (int i = 0; i < n_kr; i++) { + printf("Ritz[%d][%d] = %.16e residual[%d] = %.16e\n", t_offset + t, i, alpha_3D[t][i], i, residua_3D[t][i]); + } + } + } + } + + // Check for convergence + bool all_converged = true; + for (int t = 0; t < ortho_dim_size; t++) { + if (num_converged_3D[t] >= n_conv) { + converged_3D[t] = true; + } else { + all_converged = false; + } + } + + if (getVerbosity() >= QUDA_VERBOSE && verbose_rank) { + std::stringstream converged; + std::copy(converged_3D.begin(), converged_3D.end(), std::ostream_iterator(converged, "")); + printf("iter = %d rank = %d converged = %s min nlock %3d nconv %3d nkeep %3d\n", restart_iter + 1, + comm_rank_global(), converged.str().c_str(), min_nlock, min_nconv, min_nkeep); + } + + if (all_converged) { + reorder3D(kSpace); + converged = true; + } + + restart_iter++; + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + + // Post computation report + //--------------------------------------------------------------------------- + if (!converged) { + if (eig_param->require_convergence) { + errorQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. Exiting.", + n_conv, n_ev, n_kr, max_restarts); + } else { + warningQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. 
+ + // Post computation report + //--------------------------------------------------------------------------- + if (!converged) { + if (eig_param->require_convergence) { + errorQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. Exiting.", + n_conv, n_ev, n_kr, max_restarts); + } else { + warningQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. Continuing with current Lanczos factorisation.", + n_conv, n_ev, n_kr, max_restarts); + // Compute eigenvalues + computeEvals(kSpace, evals); + } + } else { + if (verbose_rank) { + if (getVerbosity() >= QUDA_SUMMARIZE) { + printf("TRLM (rank = %d) computed the requested %d vectors in %d restart steps and %d OP*x operations\n", + comm_rank_global(), n_conv, restart_iter, iter); + } + + if (getVerbosity() >= QUDA_VERBOSE) { + // Dump all Ritz values and residua if using Chebyshev + if (eig_param->use_poly_acc) { + for (int t = 0; t < ortho_dim_size; t++) { + for (int i = 0; i < n_conv; i++) { + printf("RitzValue[%d][%04d]: (%+.16e, %+.16e) residual %.16e\n", t, i, alpha_3D[t][i], 0.0, + residua_3D[t][i]); + } + } + } + } + } + + // Compute eigenvalues + computeEvals(kSpace, evals); + } + + push_communicator(default_comm_key); + + // ensure all processes have all e-values + comm_allreduce_sum(evals); + + // Local clean-up + cleanUpEigensolver(kSpace, evals); + }
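// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] For orientation, this is roughly
// how a user could request the new solver through the C interface. All
// numerical values are placeholders, and the surrounding setup (initQuda,
// loadGaugeQuda, and gauge smearing when use_smeared_gauge is set) is
// assumed to have happened already:
//
//   QudaInvertParam inv_param = newQudaInvertParam();
//   inv_param.dslash_type = QUDA_LAPLACE_DSLASH;   // 3-d Laplace operator
//
//   QudaEigParam eig_param = newQudaEigParam();
//   eig_param.invert_param = &inv_param;
//   eig_param.eig_type = QUDA_EIG_TR_LANCZOS_3D;   // one TRLM per timeslice
//   eig_param.spectrum = QUDA_SPECTRUM_SR_EIG;
//   eig_param.n_ev = 16; eig_param.n_kr = 48; eig_param.n_conv = 16;
//   eig_param.tol = 1e-12;
//   eig_param.use_smeared_gauge = QUDA_BOOLEAN_TRUE; // operate on gaugeSmeared
//
//   // host_evals must hold n_conv values for every *global* timeslice
//   eigensolveQuda(host_evecs, host_evals, &eig_param);
// ---------------------------------------------------------------------------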
+ + // Thick Restart 3D Member functions + //--------------------------------------------------------------------------- + void TRLM3D::lanczosStep3D(std::vector<ColorSpinorField> &v, int j) + { + // Compute r[t] = A[t] * v_j[t] - b_{j-1}[t] * v_{j-1}[t] + // r[t] = A[t] * v_j[t] + + // Use unit coefficient vectors while we only have axpby in place of axpy + std::vector<double> unit(ortho_dim_size, 1.0); + std::vector<double> alpha_j(ortho_dim_size, 0.0); + std::vector<double> beta_j(ortho_dim_size, 0.0); + + // 3D vectors that hold data for individual t components + ColorSpinorParam csParamClone(v[0]); + csParamClone.change_dim(ortho_dim, 1); + std::vector<ColorSpinorField> vecs_t(1, csParamClone); + ColorSpinorField r_t(csParamClone); + + // Identify active 3D slices. The active_3D array + // should be modified only here throughout the entire + // algorithm. + for (int t = 0; t < ortho_dim_size; t++) { + // Every element of the active array must be assessed + active_3D[t] = (num_keep_3D[t] <= j && !converged_3D[t]) ? true : false; + } + + // This will be a blocked operator with no + // connections in the ortho_dim (usually t), + // hence the 3D sections of each vector + // will be independent. + chebyOp(r[0], v[j]); + + // a_j[t] = v_j^dag[t] * r[t] + blas3d::reDotProduct(alpha_j, v[j], r[0]); + for (int t = 0; t < ortho_dim_size; t++) { + // Only active problem data is recorded + if (active_3D[t]) alpha_3D[t][j] = alpha_j[t]; + } + + // r[t] = r[t] - a_j[t] * v_j[t] + for (int t = 0; t < ortho_dim_size; t++) alpha_j[t] = active_3D[t] ? -alpha_j[t] : 0.0; + blas3d::axpby(alpha_j, v[j], unit, r[0]); + + // r[t] = r[t] - b_{j-1}[t] * v_{j-1}[t] + // Only orthogonalise active problems + for (int t = 0; t < ortho_dim_size; t++) { + if (active_3D[t]) { + int start = (j > num_keep_3D[t]) ? j - 1 : 0; + if (j - start > 0) { + + // Ensure we have enough 3D vectors + if ((int)vecs_t.size() < j - start) resize(vecs_t, j - start, csParamClone); + + // Copy the 3D data into the 3D vectors, create beta array, and create + // pointers to the 3D vectors + std::vector<double> beta_t; + beta_t.reserve(j - start); + // Copy residual + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, r_t, r[0]); + // Copy vectors + for (int i = start; i < j; i++) { + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[i - start], v[i]); + beta_t.push_back(-beta_3D[t][i]); + } + + // r[t] = r[t] - beta[t]{j-1} * v[t]{j-1} + blas::block::axpy(beta_t, {vecs_t.begin(), vecs_t.begin() + j - start}, r_t); + + // Copy residual back to 4D vector + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, r_t, r[0]); + } + } + } + + // Orthogonalise r against the Krylov space + for (int k = 0; k < 1; k++) blockOrthogonalize3D(v, r, j + 1); // future work: up to 4 times??? + + // b_j[t] = ||r[t]|| + blas3d::reDotProduct(beta_j, r[0], r[0]); + for (int t = 0; t < ortho_dim_size; t++) beta_j[t] = active_3D[t] ? sqrt(beta_j[t]) : 0.0; + + // Prepare next step. + // v_{j+1}[t] = r[t] / b_{j}[t] + for (int t = 0; t < ortho_dim_size; t++) { + if (active_3D[t]) { + beta_3D[t][j] = beta_j[t]; + beta_j[t] = 1.0 / beta_j[t]; + } + } + + std::vector<double> c(ortho_dim_size); + for (int t = 0; t < ortho_dim_size; t++) c[t] = beta_j[t] == 0.0 ? 1.0 : 0.0; + blas3d::axpby(beta_j, r[0], c, v[j + 1]); + }
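// [Editor's note] blas3d currently exposes only (c)axpby (see
// include/blas_3d.h), so the steps above emulate the timeslice axpy
// y[t] <- a[t]*x[t] + y[t] by passing a unit b vector. Sketch, with
// placeholder coefficients:
//
//   std::vector<double> a(ortho_dim_size, -0.5); // per-slice coefficients
//   std::vector<double> one(ortho_dim_size, 1.0);
//   blas3d::axpby(a, x, one, y); // y[t] = a[t]*x[t] + 1.0*y[t] on each slice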
+ + void TRLM3D::reorder3D(std::vector<ColorSpinorField> &kSpace) + { + for (int t = 0; t < ortho_dim_size; t++) { + int i = 0; + if (reverse) { + while (i < n_kr) { + if ((i == 0) || (alpha_3D[t][i - 1] >= alpha_3D[t][i])) + i++; + else { + std::swap(alpha_3D[t][i], alpha_3D[t][i - 1]); + blas3d::swap(t, kSpace[i], kSpace[i - 1]); + i--; + } + } + } else { + while (i < n_kr) { + if ((i == 0) || (alpha_3D[t][i - 1] <= alpha_3D[t][i])) + i++; + else { + std::swap(alpha_3D[t][i], alpha_3D[t][i - 1]); + blas3d::swap(t, kSpace[i], kSpace[i - 1]); + i--; + } + } + } + } + } + + void TRLM3D::eigensolveFromArrowMat3D() + { + getProfile().TPSTART(QUDA_PROFILE_EIGEN); + + // Loop over the 3D problems +#pragma omp parallel for + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + + int dim = n_kr - num_locked_3D[t]; + int arrow_pos = num_keep_3D[t] - num_locked_3D[t]; + + // Eigen objects + MatrixXd A = MatrixXd::Zero(dim, dim); + ritz_mat_3D[t].resize(dim * dim); + for (int i = 0; i < dim * dim; i++) ritz_mat_3D[t][i] = 0.0; + + // Invert the spectrum due to Chebyshev + if (reverse) { + for (int i = num_locked_3D[t]; i < n_kr - 1; i++) { + alpha_3D[t][i] *= -1.0; + beta_3D[t][i] *= -1.0; + } + alpha_3D[t][n_kr - 1] *= -1.0; + } + + // Construct arrow mat A_{dim,dim} + for (int i = 0; i < dim; i++) { + + // alpha_3D populates the diagonal + A(i, i) = alpha_3D[t][i + num_locked_3D[t]]; + } + + for (int i = 0; i < arrow_pos; i++) { + + // beta_3D populates the arrow + A(i, arrow_pos) = beta_3D[t][i + num_locked_3D[t]]; + A(arrow_pos, i) = beta_3D[t][i + num_locked_3D[t]]; + } + + for (int i = arrow_pos; i < dim - 1; i++) { + + // beta_3D populates the sub-diagonal + A(i, i + 1) = beta_3D[t][i + num_locked_3D[t]]; + A(i + 1, i) = beta_3D[t][i + num_locked_3D[t]]; + } + + // Eigensolve the arrow matrix + SelfAdjointEigenSolver<MatrixXd> eigensolver; + eigensolver.compute(A); + + // repopulate ritz matrix + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) ritz_mat_3D[t][dim * i + j] = eigensolver.eigenvectors().col(i)[j]; + + for (int i = 0; i < dim; i++) { + residua_3D[t][i + num_locked_3D[t]] = fabs(beta_3D[t][n_kr - 1] * eigensolver.eigenvectors().col(i)[dim - 1]); + // Update the alpha_3D array + alpha_3D[t][i + num_locked_3D[t]] = eigensolver.eigenvalues()[i]; + } + + // Put spectrum back in order + if (reverse) { + for (int i = num_locked_3D[t]; i < n_kr; i++) { alpha_3D[t][i] *= -1.0; } + } + } + } + + getProfile().TPSTOP(QUDA_PROFILE_EIGEN); + }
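// [Editor's note] Per timeslice, the matrix A assembled above has the usual
// thick-restart "arrow" shape, e.g. for dim = 5, arrow_pos = 2:
//
//   [ a0 .  b0 .  .  ]
//   [ .  a1 b1 .  .  ]
//   [ b0 b1 a2 b2 .  ]
//   [ .  .  b2 a3 b3 ]
//   [ .  .  .  b3 a4 ]
//
// i.e. kept Ritz values on the diagonal, their residual couplings in the
// arrow row/column, and the fresh Lanczos tridiagonal in the trailing block.
// The Ritz residual taken afterwards is the standard Lanczos bound
// |beta_{m} * s_{m,i}|, the last component of each eigenvector scaled by the
// final beta.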
+ + void TRLM3D::computeKeptRitz3D(std::vector<ColorSpinorField> &kSpace) + { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + // Multi-BLAS friendly array to store the part of the Ritz matrix we want + std::vector<std::vector<double>> ritz_mat_keep(ortho_dim_size); + + std::vector<ColorSpinorField> vecs_t; + std::vector<ColorSpinorField> kSpace_t; + + ColorSpinorParam csParamClone(kSpace[0]); + csParamClone.create = QUDA_ZERO_FIELD_CREATE; + csParamClone.change_dim(ortho_dim, 1); + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + int dim = n_kr - num_locked_3D[t]; + int keep = iter_keep_3D[t]; + + ritz_mat_keep[t].resize(dim * keep); + for (int j = 0; j < dim; j++) { + for (int i = 0; i < keep; i++) { ritz_mat_keep[t][j * keep + i] = ritz_mat_3D[t][i * dim + j]; } + } + + // Alias the vectors we wish to keep. + vector_ref<ColorSpinorField> vecs_locked(kSpace.begin() + num_locked_3D[t], + kSpace.begin() + num_locked_3D[t] + dim); + + // multiBLAS axpy. Create 3D vectors so that we may perform all t independent + // vector rotations. + vecs_t.reserve(dim); + kSpace_t.reserve(keep); + + // Create 3D arrays + for (int i = vecs_t.size(); i < dim; i++) vecs_t.push_back(csParamClone); + for (int i = kSpace_t.size(); i < keep; i++) kSpace_t.push_back(csParamClone); + + blas::zero(kSpace_t); + + // Copy data to the 3D arrays, zero out workspace, make pointers + for (int i = 0; i < dim; i++) blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[i], vecs_locked[i]); + + // Compute the axpy + blas::block::axpy(ritz_mat_keep[t], {vecs_t.begin(), vecs_t.begin() + dim}, + {kSpace_t.begin(), kSpace_t.begin() + keep}); + + // Copy back to the 4D workspace array + + // Copy compressed Krylov + for (int i = 0; i < keep; i++) { + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, kSpace_t[i], kSpace[num_locked_3D[t] + i]); + } + + // Update residual vector + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[0], kSpace[n_kr]); + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, vecs_t[0], kSpace[num_locked_3D[t] + keep]); + + // Update sub arrow matrix + for (int i = 0; i < keep; i++) + beta_3D[t][i + num_locked_3D[t]] = beta_3D[t][n_kr - 1] * ritz_mat_3D[t][dim * (i + 1) - 1]; + } + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + } + + // Orthogonalise r[t][0:] against V_[t][0:j] + void TRLM3D::blockOrthogonalize3D(std::vector<ColorSpinorField> &vecs, std::vector<ColorSpinorField> &rvecs, int j) + { + for (int i = 0; i < j; i++) { + std::vector<Complex> s_t(ortho_dim_size, 0.0); + std::vector<Complex> unit_new(ortho_dim_size, {1.0, 0.0}); + + // Block dot products stored in s_t. + blas3d::cDotProduct(s_t, vecs[i], rvecs[0]); + for (int t = 0; t < ortho_dim_size; t++) { s_t[t] *= active_3D[t] ? -1.0 : 0.0; } + + // Block orthogonalise + blas3d::caxpby(s_t, vecs[i], unit_new, rvecs[0]); + } + } + + void TRLM3D::prepareInitialGuess3D(std::vector<ColorSpinorField> &kSpace, int ortho_dim_size) + { + auto nrm = blas::norm2(kSpace[0]); + if (!std::isfinite(nrm) || nrm == 0.0) { + RNG rng(kSpace[0], 1234); + spinorNoise(kSpace[0], rng, QUDA_NOISE_UNIFORM); + } + + std::vector<double> norms(ortho_dim_size, 0.0); + std::vector<double> zeros(ortho_dim_size, 0.0); + blas3d::reDotProduct(norms, kSpace[0], kSpace[0]); + for (int t = 0; t < ortho_dim_size; t++) norms[t] = 1.0 / sqrt(norms[t]); + blas3d::axpby(norms, kSpace[0], zeros, kSpace[0]); + } + + double TRLM3D::estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) + { + RNG rng(in, 1234); + spinorNoise(in, rng, QUDA_NOISE_UNIFORM); + + // Power iteration + std::vector<double> norms(ortho_dim_size, 0.0); + std::vector<double> zeros(ortho_dim_size, 0.0); + for (int i = 0; i < 100; i++) { + if ((i + 1) % 10 == 0) { + blas3d::reDotProduct(norms, in, in); + for (int t = 0; t < ortho_dim_size; t++) norms[t] = 1.0 / sqrt(norms[t]); + blas3d::axpby(norms, in, zeros, in); + } + mat(out, in); + std::swap(out, in); + } + + // Compute spectral radius estimate + std::vector<double> inner_products(ortho_dim_size, 0.0); + blas3d::reDotProduct(inner_products, out, in); + + auto result = *std::max_element(inner_products.begin(), inner_products.end()); + comm_allreduce_max(result); + if (verbose_rank && getVerbosity() >= QUDA_VERBOSE) + printf("Chebyshev max = %e (rank = %d)\n", result, comm_rank_global()); + + // Increase final result by 10% for safety + return result * 1.10; + }
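// [Editor's note] The estimate above is a plain power iteration run on all
// timeslices at once: every tenth application of mat the field is
// renormalised slice by slice, the resulting per-slice inner products
// <v, A v> are reduced with a global max, and a 10% safety margin is added
// so that the Chebyshev window encloses the largest per-slice eigenvalue.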
+ + void TRLM3D::computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size) + { + if (size == 0) size = n_conv; + if (size > (int)evecs.size()) + errorQuda("Requesting %d eigenvectors with only storage allocated for %lu", size, evecs.size()); + if (size > (int)evals.size()) + errorQuda("Requesting %d eigenvalues with only storage allocated for %lu", size, evals.size()); + + ColorSpinorParam csParamClone(evecs[0]); + ColorSpinorField tmp(csParamClone); + + std::vector<std::vector<Complex>> evals_t(size, std::vector<Complex>(ortho_dim_size, 0.0)); + std::vector<double> norms(ortho_dim_size, 0.0); + std::vector<double> unit(ortho_dim_size, 1.0); + std::vector<Complex> n_unit(ortho_dim_size, {-1.0, 0.0}); + + for (int i = 0; i < size; i++) { + // r = A * v_i + mat(tmp, evecs[i]); + + // lambda_i = v_i^dag A v_i / (v_i^dag * v_i) + // Block dot products stored in evals_t. + blas3d::cDotProduct(evals_t[i], evecs[i], tmp); + blas3d::reDotProduct(norms, evecs[i], evecs[i]); + for (int t = 0; t < ortho_dim_size; t++) evals_t[i][t] /= sqrt(norms[t]); + + // Measure ||lambda_i*v_i - A*v_i|| + blas3d::caxpby(evals_t[i], evecs[i], n_unit, tmp); + blas3d::reDotProduct(norms, tmp, tmp); + for (int t = 0; t < ortho_dim_size; t++) residua_3D[t][i] = sqrt(norms[t]); + } + + // If size = n_conv, this routine is called post sort + if (size == n_conv) { + evals.resize(ortho_dim_size * comm_dim_global(ortho_dim) * n_conv, 0.0); + + if (verbose_rank) { + int t_offset = ortho_dim_size * comm_coord_global(3); + for (int t = 0; t < ortho_dim_size; t++) { + for (int i = 0; i < size; i++) { + + // Use printf (rather than printfQuda) so that any rank owning this timeslice can report + if (getVerbosity() >= QUDA_VERBOSE) { + printf("Eval[%02d][%04d] = (%+.16e,%+.16e) residual = %+.16e\n", t_offset + t, i, evals_t[i][t].real(), + evals_t[i][t].imag(), residua_3D[t][i]); + } + + // Transfer evals to eval array + evals[(t_offset + t) * size + i] = evals_t[i][t]; + } + } + } + } + } + +} // namespace quda diff --git a/lib/eigensolve_quda.cpp index 8130897cbb..52f4fd1100 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -104,6 +104,10 @@ namespace quda logQuda(QUDA_VERBOSE, "Creating TR Lanczos eigensolver\n"); eig_solver = new TRLM(mat, eig_param); break; + case QUDA_EIG_TR_LANCZOS_3D: + logQuda(QUDA_VERBOSE, "Creating TR Lanczos 3-d eigensolver\n"); + eig_solver = new TRLM3D(mat, eig_param); + break; case QUDA_EIG_BLK_TR_LANCZOS: logQuda(QUDA_VERBOSE, "Creating Block TR Lanczos eigensolver\n"); eig_solver = new BLKTRLM(mat, eig_param); @@ -236,7 +240,7 @@ int n_eig = n_conv; if (compute_svd) n_eig *= 2; kSpace.resize(n_eig); - evals.resize(n_conv); + if (eig_param->eig_type != QUDA_EIG_TR_LANCZOS_3D) evals.resize(n_conv); // Only save if outfile is defined if (strcmp(eig_param->vec_outfile, "") != 0) { @@ -580,6 +584,7 @@ void EigenSolver::computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size) { + if (size == 0) size = n_conv; auto batch_size = eig_param->compute_evals_batch_size; if (size > static_cast<int>(evecs.size())) @@ -683,7 +688,7 @@ case QUDA_SPECTRUM_SR_EIG: printfQuda("'SR' -> sort with real(x) in increasing algebraic order, smallest first.\n"); break; case QUDA_SPECTRUM_LI_EIG: printfQuda("'LI' -> sort with imag(x) in decreasing algebraic order, largest first.\n"); break; case QUDA_SPECTRUM_SI_EIG: printfQuda("'SI' -> sort with imag(x) in increasing algebraic order, smallest first\n"); break; - default: errorQuda("Unkown spectrum type requested: %d", spec_type); + default: errorQuda("Unknown spectrum type requested: %d", spec_type); } }
diff --git a/lib/gauge_laplace.cpp index 00843bb272..752670b676 100644 --- a/lib/gauge_laplace.cpp +++ b/lib/gauge_laplace.cpp @@ -29,7 +29,7 @@ namespace quda { comm_dim[i] = comm_dim_partitioned(i); if (laplace3D == i) comm_dim[i] = 0; } - ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, dagger, comm_dim, profile); + ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, comm_dim, profile); } void GaugeLaplace::DslashXpay(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, @@ -43,7 +43,7 @@ { comm_dim[i] = comm_dim_partitioned(i); if (laplace3D == i) comm_dim[i] = 0; } - ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, dagger, comm_dim, profile); + ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, comm_dim, profile); } void GaugeLaplace::M(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const @@ -79,7 +79,11 @@ { // do nothing } - GaugeLaplacePC::GaugeLaplacePC(const DiracParam &param) : GaugeLaplace(param) { } + GaugeLaplacePC::GaugeLaplacePC(const DiracParam &param) : GaugeLaplace(param) + { + for (auto i = 0; i < 4; i++) + if (laplace3D == i) errorQuda("Cannot use 3-d operator with e/o preconditioning"); + } GaugeLaplacePC::GaugeLaplacePC(const GaugeLaplacePC &dirac) : GaugeLaplace(dirac) { } diff --git a/lib/interface_quda.cpp index 1d0f7d3688..6fd6382488 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1434,7 +1434,7 @@ void endQuda(void) namespace quda { - void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { double kappa = inv_param->kappa; if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { @@ -1549,7 +1549,7 @@ diracParam.use_mobius_fused_kernel = inv_param->use_mobius_fused_kernel; } - void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1567,7 +1567,7 @@ inv_param->cuda_prec_sloppy); } - void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1586,7 +1586,7 @@ } // The preconditioner currently mimics the sloppy operator with no comms - void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms) + void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc, bool comms) { setDiracParam(diracParam, inv_param, pc); @@ -1617,7 +1617,7 @@ inv_param->cuda_prec_precondition); } - void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc, bool use_smeared_gauge) { setDiracParam(diracParam, inv_param, pc); @@ -1625,6 +1625,20 @@ diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatExtended : gaugeExtended; diracParam.fatGauge = gaugeFatExtended; diracParam.longGauge = gaugeLongExtended; + } else if (use_smeared_gauge) { + if (!gaugeSmeared) errorQuda("No smeared gauge field present"); + if (inv_param->dslash_type == QUDA_LAPLACE_DSLASH) { + if (gaugeSmeared->GhostExchange() == QUDA_GHOST_EXCHANGE_EXTENDED) { + GaugeFieldParam gauge_param(*gaugePrecise); + GaugeField gaugeEig(gauge_param); + copyExtendedGauge(gaugeEig, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); + gaugeEig.exchangeGhost(); + std::swap(gaugeEig, *gaugeSmeared); + } + diracParam.gauge = gaugeSmeared; + } else { + errorQuda("Smeared gauge field not supported for operator %d", inv_param->dslash_type); + } } else { diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatEigensolver : gaugeEigensolver; diracParam.fatGauge = gaugeFatEigensolver; @@ -1646,7 +1660,7 @@ inv_param->cuda_prec_eigensolver); }
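// [Editor's note] When use_smeared_gauge is set, the eigensolver operator is
// built from gaugeSmeared rather than gaugeEigensolver. If the smeared field
// was created with extended ghost exchange, it is first repacked into a
// field with gaugePrecise's layout and its ghosts re-exchanged, presumably
// because the Laplace dslash expects an ordinary halo rather than an
// extended one.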
- void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve) + void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, bool pc_solve) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1663,8 +1677,7 @@ dPre = Dirac::create(diracPreParam); } - void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, - const bool pc_solve) + void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, bool pc_solve) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1684,8 +1697,8 @@ dRef = Dirac::create(diracRefParam); } - void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param, - const bool pc_solve) + void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param, bool pc_solve, + bool use_smeared_gauge) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1696,7 +1709,7 @@ setDiracSloppyParam(diracSloppyParam, &param, pc_solve); bool pre_comms_flag = (param.schwarz_type != QUDA_INVALID_SCHWARZ) ? false : true; setDiracPreParam(diracPreParam, &param, pc_solve, pre_comms_flag); - setDiracEigParam(diracEigParam, &param, pc_solve); + setDiracEigParam(diracEigParam, &param, pc_solve, use_smeared_gauge); d = Dirac::create(diracParam); // create the Dirac operator dSloppy = Dirac::create(diracSloppyParam); @@ -2392,14 +2405,17 @@ void checkClover(QudaInvertParam *param) { quda::GaugeField *checkGauge(QudaInvertParam *param) { - quda::GaugeField *cudaGauge = nullptr; - if (param->dslash_type != QUDA_ASQTAD_DSLASH) { - if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist"); + quda::GaugeField *U = param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise : gaugePrecise; - if (param->cuda_prec != gaugePrecise->Precision()) { - errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision()); - } + if (U == nullptr) + errorQuda("Precise gauge %sfield doesn't exist", param->dslash_type == QUDA_ASQTAD_DSLASH ? 
"fat " : ""); + + if (param->cuda_prec != U->Precision()) { + errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, U->Precision()); + } + if (param->dslash_type != QUDA_ASQTAD_DSLASH) { if (param->cuda_prec_sloppy != gaugeSloppy->Precision() || param->cuda_prec_precondition != gaugePrecondition->Precision() || param->cuda_prec_refinement_sloppy != gaugeRefinement->Precision() @@ -2416,18 +2432,10 @@ quda::GaugeField *checkGauge(QudaInvertParam *param) if (gaugePrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist"); if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist"); if (gaugeEigensolver == nullptr) errorQuda("Refinement gauge field doesn't exist"); - if (param->overlap) { - if (gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist"); - } - cudaGauge = gaugePrecise; + if (param->overlap && gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist"); } else { - if (gaugeFatPrecise == nullptr) errorQuda("Precise gauge fat field doesn't exist"); if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist"); - if (param->cuda_prec != gaugeFatPrecise->Precision()) { - errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugeFatPrecise->Precision()); - } - if (param->cuda_prec_sloppy != gaugeFatSloppy->Precision() || param->cuda_prec_precondition != gaugeFatPrecondition->Precision() || param->cuda_prec_refinement_sloppy != gaugeFatRefinement->Precision() @@ -2450,23 +2458,18 @@ quda::GaugeField *checkGauge(QudaInvertParam *param) if (gaugeFatPrecondition == nullptr) errorQuda("Precondition gauge fat field doesn't exist"); if (gaugeFatRefinement == nullptr) errorQuda("Refinement gauge fat field doesn't exist"); if (gaugeFatEigensolver == nullptr) errorQuda("Eigensolver gauge fat field doesn't exist"); - if (param->overlap) { - if (gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist"); - } + if (param->overlap && gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist"); if (gaugeLongSloppy == nullptr) errorQuda("Sloppy gauge long field doesn't exist"); if (gaugeLongPrecondition == nullptr) errorQuda("Precondition gauge long field doesn't exist"); if (gaugeLongRefinement == nullptr) errorQuda("Refinement gauge long field doesn't exist"); if (gaugeLongEigensolver == nullptr) errorQuda("Eigensolver gauge long field doesn't exist"); - if (param->overlap) { - if (gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist"); - } - cudaGauge = gaugeFatPrecise; + if (param->overlap && gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist"); } checkClover(param); - return cudaGauge; + return U; } void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse) @@ -2582,7 +2585,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Create the dirac operator with a sloppy and a precon. 
bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); - createDiracWithEig(d, dSloppy, dPre, dEig, *inv_param, pc_solve); + createDiracWithEig(d, dSloppy, dPre, dEig, *inv_param, pc_solve, eig_param->use_smeared_gauge); Dirac &dirac = *dEig; //------------------------------------------------------ @@ -2679,7 +2682,8 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Transfer Eigenpairs back to host if using GPU eigensolver. The copy // will automatically rotate from device UKQCD gamma basis to the // host side gamma basis. - for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); } + memcpy(host_evals, evals.data(), sizeof(Complex) * evals.size()); + if (!(eig_param->arpack_check)) { for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i]; } @@ -4977,7 +4981,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, for (unsigned int i = 0; i < n_steps; i++) { if (i) in = out; - ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, comm_dim, profileWuppertal); + ApplyLaplace(out, in, *precise, 3, a, b, in, parity, comm_dim, profileWuppertal); logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out)); } @@ -5123,7 +5127,9 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); - logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); + logQuda(QUDA_SUMMARIZE, "step %03d plaquette (mean %.16e, spatial %.16e temporal %.16e) Q charge = %.16e\n", 0, + obs_param[measurement_n].plaquette[0], obs_param[measurement_n].plaquette[1], + obs_param[measurement_n].plaquette[2], obs_param[measurement_n].qcharge); // set default dir_ignore = 3 for APE and STOUT for compatibility int dir_ignore = smear_param->dir_ignore; @@ -5142,13 +5148,15 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable case QUDA_GAUGE_SMEAR_HYP: HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, dir_ignore); break; - default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); + default: errorQuda("Unknown gauge smear type %d", smear_param->smear_type); } if ((i + 1) % smear_param->meas_interval == 0) { measurement_n++; gaugeObservablesQuda(&obs_param[measurement_n]); - logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge); + logQuda(QUDA_SUMMARIZE, "step %03d plaquette (mean %.16e, spatial %.16e temporal %.16e) Q charge = %.16e\n", + i + 1, obs_param[measurement_n].plaquette[0], obs_param[measurement_n].plaquette[1], + obs_param[measurement_n].plaquette[2], obs_param[measurement_n].qcharge); } } @@ -5297,7 +5305,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] = Laplace [0] copyExtendedGauge(precise, gin, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, comm_dim, profileGFlow); // [0] = [4] = Laplace [0] = Laplace [3] f_temp0 = f_temp4; @@ -5315,7 +5323,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] <- Laplace [1] copyExtendedGauge(precise, gout, QUDA_CUDA_FIELD_LOCATION); 
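// [Editor's note] The boolean dagger argument has been dropped from every
// ApplyLaplace call site in this patch (here and in the Wuppertal-smearing
// path above): the Laplace stencil is Hermitian, so the daggered and
// un-daggered applications coincide and the flag was dead weight.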
precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp1, precise, 4, a, b, f_temp1, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp1, precise, 4, a, b, f_temp1, parity, comm_dim, profileGFlow); // [1] <- [4] f_temp1 = f_temp4; @@ -5333,7 +5341,7 @@ // [4] <- Laplace [2] copyExtendedGauge(precise, gin, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, comm_dim, profileGFlow); // [2] <- [4] = Laplace [2] f_temp2 = f_temp4; diff --git a/lib/laplace.cu index aab6dd2d8e..5e5c3803eb 100644 --- a/lib/laplace.cu +++ b/lib/laplace.cu @@ -151,18 +151,18 @@ namespace quda LaplaceApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x, const GaugeField &U, int dir, double a, double b, int parity, - bool dagger, const int *comm_override, TimeProfile &profile) + const int *comm_override, TimeProfile &profile) { constexpr int nDim = 4; auto halo = ColorSpinorField::create_comms_batch(in); if (in.Nspin() == 1) { constexpr int nSpin = 1; - LaplaceArg<Float, nSpin, nColor, recon> arg(out, in, halo, U, dir, a, b, x, parity, dagger, comm_override); + LaplaceArg<Float, nSpin, nColor, recon> arg(out, in, halo, U, dir, a, b, x, parity, comm_override); Laplace<decltype(arg)> laplace(arg, out, in, halo); dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, halo, profile); } else if (in.Nspin() == 4) { constexpr int nSpin = 4; - LaplaceArg<Float, nSpin, nColor, recon> arg(out, in, halo, U, dir, a, b, x, parity, dagger, comm_override); + LaplaceArg<Float, nSpin, nColor, recon> arg(out, in, halo, U, dir, a, b, x, parity, comm_override); Laplace<decltype(arg)> laplace(arg, out, in, halo); dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, halo, profile); } else { @@ -175,11 +175,11 @@ // out(x) = M*in = - a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu) + b*in(x) // Omits direction 'dir' from the operator. void ApplyLaplace(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U, - int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, + int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, const int *comm_override, TimeProfile &profile) { if constexpr (is_enabled<QUDA_LAPLACE_DSLASH>()) { - instantiate<LaplaceApply>(out, in, x, U, dir, a, b, parity, dagger, comm_override, profile); + instantiate<LaplaceApply>(out, in, x, U, dir, a, b, parity, comm_override, profile); } else { errorQuda("Laplace operator has not been enabled"); } diff --git a/lib/quda_fortran.F90 index 8eda362fbe..fc7666dd60 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -224,9 +224,6 @@ module quda_fortran real(8) :: temp real(8) :: clock - ! Enable auto-tuning? - QudaTune :: tune - ! Number of steps in s-step algorithms integer(4) :: nsteps
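// [Editor's note] With the QudaTune enum gone, tuning is toggled with plain
// bools, and the new push/pop helpers in lib/util_quda.cpp (below) scope a
// temporary override. Sketch:
//
//   pushTuning(false); // save the current setting and disable tuning
//   // ... launch kernels that must not retune, e.g. inside a tuned region
//   popTuning();       // restore the previous setting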
diff --git a/lib/reduce_helper.cu index cf95032656..72d18ba433 100644 --- a/lib/reduce_helper.cu +++ b/lib/reduce_helper.cu @@ -44,7 +44,7 @@ namespace quda void apply(const qudaStream_t &stream) { // intentionally do not autotune, since this can be called inside a tuning region - auto tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity()); + auto tp = tuneLaunch(*this, false, getVerbosity()); launch_device(tp, stream, init_arg(reduce_count, n_reduce)); } }; diff --git a/lib/solve.cpp index c9ba9baf21..79fa868633 100644 --- a/lib/solve.cpp +++ b/lib/solve.cpp @@ -317,8 +317,8 @@ getProfile().TPSTOP(QUDA_PROFILE_EPILOGUE); } - void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param, - const bool pc_solve); + void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param, bool pc_solve, + bool use_smeared_gauge); extern std::vector<ColorSpinorField> solutionResident; @@ -349,7 +349,8 @@ // Create the dirac operator and operators for sloppy, precondition, // and an eigensolver - createDiracWithEig(dirac, diracSloppy, diracPre, diracEig, param, pc_solve); + createDiracWithEig(dirac, diracSloppy, diracPre, diracEig, param, pc_solve, + param.eig_param ? static_cast<QudaEigParam *>(param.eig_param)->use_smeared_gauge : false); // wrap CPU host side pointers ColorSpinorParam cpuParam(hp_b[0], param, u.X(), pc_solution, param.input_location); diff --git a/lib/tune.cpp index 3bcd7da964..2aefd3988f 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -152,9 +152,37 @@ const map &getTuneCache() { return tunecache; } /** - * Deserialize tunecache from an istream, useful for reading a file or receiving from other nodes. + * @brief Distribute the tunecache from a given rank to all other nodes. + * @param[in] root_rank From which global rank to do the broadcast + * @param[out] tc_recv Where we wish to receive the tunecache. This + * defaults to the local tunecache. + */ + static void broadcastTuneCache(int32_t root_rank = 0, map &tc_recv = tunecache); + + void joinTuneCache(const std::vector<int> &global_tune_rank) + { + // vector of the split tunecaches + std::vector<map> split_tc(global_tune_rank.size()); + // broadcast each tunecache to every process + for (auto i = 0u; i < global_tune_rank.size(); i++) { + broadcastTuneCache(global_tune_rank[i], split_tc[i]); + if (comm_rank() == global_tune_rank[i]) split_tc[i] = tunecache; + logQuda(QUDA_DEBUG_VERBOSE, "i = %d tune_rank = %d tc size = %lu\n", i, global_tune_rank[i], split_tc[i].size()); + } + + // now merge the maps + tunecache = split_tc[0]; + for (auto i = 1u; i < global_tune_rank.size(); i++) { tunecache.merge(split_tc[i]); } + }
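// [Editor's note] std::map::merge leaves the destination's entry in place
// when a key exists in both maps, so after joinTuneCache any kernel tuned by
// several split-grid partitions resolves to the entry from the earliest rank
// in global_tune_rank.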
+ + /** + * Deserialize tunecache from an istream, useful for reading a file + * or receiving from other nodes. + * @param[in] in The stream from which we are deserializing + * @param[out] tc The tunecache to which we are deserializing. This + * defaults to the local tunecache. */ - static void deserializeTuneCache(std::istream &in) + static void deserializeTuneCache(std::istream &in, map &tc = tunecache) { std::string line; std::stringstream ls; @@ -185,7 +213,7 @@ ls.ignore(1); // throw away tab before comment getline(ls, param.comment); // assume anything remaining on the line is a comment param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this - tunecache[key] = param; + tc[key] = param; } } @@ -320,30 +348,26 @@ } } - /** - * @brief Distribute the tunecache from a given rank to all other nodes. - * @param[in] root_rank From which global rank to do the broadcast - */ - static void broadcastTuneCache(int32_t root_rank = 0) + static void broadcastTuneCache(int32_t root_rank, map &tc_recv) { std::stringstream serialized; size_t size; - if (comm_rank_global() == root_rank) { + if (comm_rank() == root_rank) { serializeTuneCache(serialized); size = serialized.str().length(); } - comm_broadcast_global(&size, sizeof(size_t), root_rank); + comm_broadcast(&size, sizeof(size_t), root_rank); if (size > 0) { - if (comm_rank_global() == root_rank) { - comm_broadcast_global(const_cast<char *>(serialized.str().c_str()), size, root_rank); + if (comm_rank() == root_rank) { + comm_broadcast(const_cast<char *>(serialized.str().c_str()), size, root_rank); } else { std::vector<char> serstr(size + 1); - comm_broadcast_global(serstr.data(), size, root_rank); + comm_broadcast(serstr.data(), size, root_rank); serstr[size] = '\0'; // null-terminate serialized.str(serstr.data()); - deserializeTuneCache(serialized); + deserializeTuneCache(serialized, tc_recv); } } } @@ -353,7 +377,7 @@ */ void loadTuneCache() { - if (getTuning() == QUDA_TUNE_NO) { + if (!getTuning()) { warningQuda("Autotuning disabled"); return; } @@ -826,20 +850,20 @@ */ void broadcast(int32_t root_rank) { - size_t size; + size_t size = 0; std::string serialized; - if (comm_rank_global() == root_rank) { + if (comm_rank() == root_rank) { serialized = serialize(); size = serialized.length(); } - comm_broadcast_global(&size, sizeof(size_t), root_rank); + comm_broadcast(&size, sizeof(size_t), root_rank); if (size > 0) { - if (comm_rank_global() == root_rank) { - comm_broadcast_global(const_cast<char *>(serialized.c_str()), size, root_rank); + if (comm_rank() == root_rank) { + comm_broadcast(const_cast<char *>(serialized.c_str()), size, root_rank); } else { std::vector<char> serstr(size + 1); - comm_broadcast_global(serstr.data(), size, root_rank); + comm_broadcast(serstr.data(), size, root_rank); serstr[size] = '\0'; // null-terminate std::string_view deserialized(serstr.data()); deserialize(deserialized); @@ -859,7 +883,7 @@
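// [Editor's note] Both broadcast paths above use the same two-phase pattern:
// broadcast the serialised length first, then the payload. Swapping
// comm_broadcast_global for comm_broadcast scopes the exchange to the
// current (possibly split-grid) communicator, which is what joinTuneCache
// relies on to gather per-partition caches.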
* Return the optimal launch parameters for a given kernel, either * by retrieving them from tunecache or autotuning on the spot. */ - TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity) + TuneParam tuneLaunch(Tunable &tunable, bool enabled, QudaVerbosity verbosity) { #ifdef LAUNCH_TIMER launchTimer.TPSTART(QUDA_PROFILE_TOTAL); @@ -880,7 +904,7 @@ it = tunecache.find(key); // first check if we have the tuned value and return if we have it - if (enabled == QUDA_TUNE_YES && it != tunecache.end()) { + if (enabled && it != tunecache.end()) { #ifdef LAUNCH_TIMER launchTimer.TPSTOP(QUDA_PROFILE_PREAMBLE); @@ -926,7 +950,7 @@ static TuneParam param; - if (enabled == QUDA_TUNE_NO) { + if (!enabled) { TuneParam param_default; param_default.aux = make_int4(-1, -1, -1, -1); tunable.defaultTuneParam(param_default); @@ -959,7 +983,7 @@ - we are tuning an uber kernel in which case do the tuning on all ranks since we can't guarantee that all nodes are partaking */ - if (comm_rank_global() == tune_rank || !commGlobalReduction() || policyTuning() || uberTuning()) { + if (comm_rank() == tune_rank || !commGlobalReduction() || policyTuning() || uberTuning()) { TuneParam best_param; TuneCandidates tc(tunable.num_candidates()); float best_time; diff --git a/lib/util_quda.cpp index e279c0f43e..b033efb20c 100644 --- a/lib/util_quda.cpp +++ b/lib/util_quda.cpp @@ -51,17 +51,18 @@ bool getRankVerbosity() { return rank_verbosity; } +static bool tune = true; + // default has autotuning enabled but can be overridden with the QUDA_ENABLE_TUNING environment variable -QudaTune getTuning() { +bool getTuning() +{ static bool init = false; - static QudaTune tune = QUDA_TUNE_YES; - if (!init) { char *enable_tuning = getenv("QUDA_ENABLE_TUNING"); - if (!enable_tuning || strcmp(enable_tuning,"0")!=0) { - tune = QUDA_TUNE_YES; + if (!enable_tuning || strcmp(enable_tuning, "0") != 0) { + tune = true; } else { - tune = QUDA_TUNE_NO; + tune = false; } init = true; } @@ -69,6 +70,34 @@ return tune; } +void setTuning(bool tuning) +{ + // first check if tuning is disabled, in which case we do nothing + static bool init = false; + static bool tune_disable = false; + if (!init) { + char *enable_tuning = getenv("QUDA_ENABLE_TUNING"); + tune_disable = (enable_tuning && strcmp(enable_tuning, "0") == 0); + init = true; + } + if (!tune_disable) tune = tuning; +} + +static std::stack<bool> tstack; + +void pushTuning(bool tuning) +{ + tstack.push(getTuning()); + setTuning(tuning); +} + +void popTuning() +{ + if (tstack.empty()) errorQuda("popTuning() called with empty stack"); + setTuning(tstack.top()); + tstack.pop(); +} + void setOutputPrefix(const char *prefix) { strncpy(prefix_, prefix, MAX_PREFIX_SIZE); @@ -80,7 +109,6 @@ void setOutputFile(FILE *outfile) { outfile_ = outfile; } - static std::stack<QudaVerbosity> vstack; void pushVerbosity(QudaVerbosity verbosity) diff --git a/tests/CMakeLists.txt index 2beb90ceae..da3e7a97bf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,7 +110,7 @@ if(QUDA_DIRAC_WILSON install(TARGETS deflated_invert_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() -if(QUDA_DIRAC_STAGGERED) +if(QUDA_DIRAC_STAGGERED OR QUDA_DIRAC_LAPLACE) add_executable(staggered_dslash_test staggered_dslash_test.cpp) target_link_libraries(staggered_dslash_test ${TEST_LIBS}) quda_checkbuildtest(staggered_dslash_test QUDA_BUILD_ALL_TESTS) @@ -291,6 +291,8 @@ if(QUDA_MPI OR QUDA_QMP) message(STATUS "ctest will run on ${QUDA_TEST_NUM_PROCS} processes") set(QUDA_CTEST_LAUNCH 
${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS} CACHE STRING "CTest Launcher command for QUDA's tests") +else() + set(QUDA_TEST_NUM_PROCS 1) endif() # BLAS tests @@ -1009,19 +1011,15 @@ if(QUDA_DIRAC_WILSON) --enable-testing true --gtest_output=xml:invert_test_wilson.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson - COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson.xml) + add_test(NAME invert_test_splitgrid_wilson + COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} + --dslash-type wilson --ngcrkrylov 8 + --dim 2 4 6 8 --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_wilson.xml) - set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) endif() if(QUDA_DIRAC_TWISTED_MASS) @@ -1186,19 +1184,15 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:invert_test_staggered_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_staggered_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} - --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + add_test(NAME invert_test_splitgrid_staggered_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} + --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) + + set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) add_test(NAME invert_test_asqtad_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --enable-testing true --nsrc 4 --nsrc-tile 4 --gtest_output=xml:invert_test_asqtad_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_asqtad_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} - --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true - --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_asqtad_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + add_test(NAME invert_test_splitgrid_asqtad_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} + --dslash-type asqtad 
--ngcrkrylov 8 --compute-fat-long true + --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) if (QUDA_DIRAC_LAPLACE) add_test(NAME invert_test_laplace_${prec} @@ -1243,21 +1231,6 @@ if (QUDA_DIRAC_DISTANCE_PRECONDITIONING) --distance-pc-alpha0 0.1 --distance-pc-t0 4 --enable-testing true --gtest_output=xml:invert_test_wilson_distance_pc.xml) - - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson - COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --niter 1000 - --distance-pc-alpha0 0.1 --distance-pc-t0 4 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson_distance_pc.xml) - - set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() endif() if(QUDA_DIRAC_CLOVER) diff --git a/tests/clover_force_test.cpp index 4a9edb57b2..5847bec40a 100644 --- a/tests/clover_force_test.cpp +++ b/tests/clover_force_test.cpp @@ -127,7 +127,7 @@ std::tuple clover_force_test(test_t param) gauge_param.gauge_order = QUDA_MILC_GAUGE_ORDER; gauge_param.overwrite_mom = 1; - if (getTuning() == QUDA_TUNE_YES) + if (getTuning()) computeTMCloverForceQuda(mom.data(), in.data(), in0.data(), coeff.data(), nvector, &gauge_param, &inv_param, detratio); diff --git a/tests/dslash_test_utils.h index 3d77668d97..ec4ca5b13a 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -1062,12 +1062,12 @@ struct DslashTestWrapper { 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min, 2 * ghost_bytes); ::testing::Test::RecordProperty("Gflops", std::to_string(1.0e-9 * flops / dslash_time.event_time)); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_GPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes); } } diff --git a/tests/eigensolve_test.cpp index e325511e05..a9561ca2a5 100644 --- a/tests/eigensolve_test.cpp +++ b/tests/eigensolve_test.cpp @@ -191,17 +191,14 @@ std::vector eigensolve(test_t test_param) // Host side arrays to store the eigenpairs computed by QUDA int n_eig = eig_n_conv; if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; - std::vector<quda::ColorSpinorField> evecs(n_eig); quda::ColorSpinorParam cs_param; constructWilsonTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); // Void pointers to host side arrays, compatible with the QUDA interface. std::vector<void *> host_evecs_ptr(n_eig); // Allocate host side memory and pointers - for (int i = 0; i < n_eig; i++) { - evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].data(); - } + std::vector<quda::ColorSpinorField> evecs(n_eig, cs_param); + for (int i = 0; i < n_eig; i++) host_evecs_ptr[i] = evecs[i].data(); // Complex eigenvalues std::vector<__complex__ double> evals(eig_n_conv);
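// [Editor's note] For the 3-d solver the interface now copies back
// evals.size() eigenvalues rather than n_conv, laid out as one block of
// n_conv values per global timeslice (cf. TRLM3D::computeEvals above).
// Hypothetical host-side unpacking (sketch):
//
//   double _Complex lambda_ti = host_evals[t_global * n_conv + i];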
diff --git a/tests/gauge_path_test.cpp index 1efa49ef97..c7820d8575 100644 --- a/tests/gauge_path_test.cpp +++ b/tests/gauge_path_test.cpp @@ -179,7 +179,7 @@ void gauge_force_test(force_test_t test_param) errorQuda("Unsupported gauge order %d", gauge_order); } - if (getTuning() == QUDA_TUNE_YES) { + if (getTuning()) { if (compute_force) computeGaugeForceQuda(mom, sitelink, input_path_buf, length, loop_coeff_d, num_paths, max_length, eb3, &gauge_param); @@ -324,7 +324,7 @@ // compute the plaquette as part of validation obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - if (getTuning() == QUDA_TUNE_YES) { gaugeObservablesQuda(&obsParam); } + if (getTuning()) { gaugeObservablesQuda(&obsParam); } quda::host_timer_t host_timer; // Multiple execution to exclude warmup time in the first run diff --git a/tests/host_reference/dslash_reference.cpp index 5bada81761..9b6ef102c8 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -746,17 +746,17 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx) + QudaInvertParam &inv_param, int laplace3D, int src_idx) { std::vector<quda::ColorSpinorField> out_vector(1); out_vector[0] = out; - return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param, src_idx); + return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param, laplace3D, src_idx); } std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx) + QudaInvertParam &inv_param, int laplace3D, int src_idx) { int dagger = inv_param.dagger == QUDA_DAG_YES ? 
1 : 0; double l2r_max = 0.0; @@ -783,7 +783,7 @@ for (int i = 0; i < multishift; i++) { auto &out = out_vector[i]; double mass = 0.5 * sqrt(inv_param.offset[i]); - stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type, laplace3D); mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); @@ -809,10 +809,7 @@ auto &out = out_vector[0]; double mass = inv_param.mass; if (inv_param.solution_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); - - // correct for the massRescale function inside invertQuda - if (is_laplace(dslash_type)) ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); + stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type, laplace3D); } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type, laplace3D); } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type, laplace3D); } else { errorQuda("Invalid staggered solution type %d", inv_param.solution_type); } @@ -844,8 +841,9 @@ return {l2r_max, hqr_max}; } -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std::vector<double _Complex> &lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link, + int laplace3D) { QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; @@ -861,10 +859,7 @@ else sol_type = QUDA_MATDAG_MAT_SOLUTION; } else { - if (use_pc) - sol_type = QUDA_MATPC_SOLUTION; - else - sol_type = QUDA_MAT_SOLUTION; + sol_type = use_pc ? 
QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; } // Create temporary spinors @@ -872,7 +867,7 @@ quda::ColorSpinorField ref(csParam); if (sol_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type, laplace3D); } else if (sol_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type, laplace3D); } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type, laplace3D); } - // Compute M * x - \lambda * x - caxpy(-lambda, spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double l2r = sqrt(nrm2 / src2); + if (laplace3D == 3) { + auto batch_size = (spinor.VolumeCB() / spinor.X()[3]) * stag_spinor_site_size; + auto t_offset = spinor.X()[3] * comm_coord(3); + std::vector<double> nrm2(spinor.X()[3] * comm_dim(3), 0.0); + std::vector<double> src2(spinor.X()[3] * comm_dim(3), 0.0); + // Compute M * x - \lambda * x on each slice + for (auto t = 0; t < spinor.X()[3]; t++) { + auto t_global = t_offset + t; + auto offset = t * batch_size * inv_param.cpu_prec; - printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + for (int parity = 0; parity < spinor.SiteSubset(); parity++) { + caxpy(-lambda[t_global], static_cast<char *>(spinor.data()) + offset, static_cast<char *>(ref.data()) + offset, + batch_size, inv_param.cpu_prec); - return l2r; + nrm2[t_global] += norm_2(static_cast<char *>(ref.data()) + offset, batch_size, inv_param.cpu_prec, false); + src2[t_global] += norm_2(static_cast<char *>(spinor.data()) + offset, batch_size, inv_param.cpu_prec, false); + + offset += spinor.VolumeCB() * stag_spinor_site_size * inv_param.cpu_prec; + } + } + + comm_allreduce_sum(nrm2); + comm_allreduce_sum(src2); + + for (auto t = 0; t < spinor.X()[3] * comm_dim(3); t++) { + auto l = ((double *)&(lambda[t]))[0]; + printf("Eigenvector %4d, t = %d lambda = %15.14e: tol %.2e, host residual = %.15e\n", i, t, l, eig_param.tol, + sqrt(nrm2[t] / src2[t])); + } + auto total_nrm2 = std::accumulate(nrm2.begin(), nrm2.end(), 0.0); + auto total_src2 = std::accumulate(src2.begin(), src2.end(), 0.0); + + return sqrt(total_nrm2 / total_src2); + } else { + // Compute M * x - \lambda * x + caxpy(-lambda[0], spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double l2r = sqrt(nrm2 / src2); + printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + return l2r; + } }
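// [Editor's note] In the laplace3D == 3 branch above, the host field is
// addressed in bytes: offset starts at t * batch_size * cpu_prec inside the
// even-parity half and is then advanced by VolumeCB * site_size * cpu_prec
// to reach the same timeslice in the odd-parity half, so each per-slice
// residual accumulates both parities before the global reduction. Note also
// that the std::accumulate initial values must be 0.0, not 0, or the double
// sums would be truncated to integers.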
double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, - double _Complex sigma, int i, QudaEigParam &eig_param, - quda::GaugeField &fat_link, quda::GaugeField &long_link) + const std::vector<double _Complex> &sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link, int laplace3D) { + if (laplace3D == 3) errorQuda("3-d Laplace operator not supported"); QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false); @@ -912,10 +941,10 @@ quda::ColorSpinorField ref(csParam); // Only `mat` is used here - stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type, laplace3D); // Compute M * x_left - \sigma * x_right - caxpy(-sigma, spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + caxpy(-sigma[0], spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double src2 = norm_2(spinor_left.data(), spinor_left.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double l2r = sqrt(nrm2 / src2); diff --git a/tests/host_reference/dslash_reference.h index 3c9543107e..307bb1db70 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -217,11 +217,13 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace * @param inv_param Invert params, used to query the solve type, etc + * @param laplace3D Whether we are working on the 3-d Laplace operator + * @param src_idx The source index we are working on (when doing multi-RHS) * @return The residual and HQ residual (if requested) */ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx); + QudaInvertParam &inv_param, int laplace3D, int src_idx); /** * @brief Verify a single- or multi-shift staggered inversion on the host * * @param in The host input spinor * @param out The host output spinor(s) * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts + * @param laplace3D Whether we are working on the 3-d Laplace operator + * @param src_idx The source index we are working on (when doing multi-RHS) * @return The residual and HQ residual (if requested) */ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx = 0); + QudaInvertParam &inv_param, int laplace3D, int src_idx = 0);
* @param lambda The host eigenvalue(s) to be verified * @param i The number of the eigenvalue, only used when printing outputs * @param eig_param Eigensolve params, used to query the operator type, etc * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace + * @param laplace3D Whether we are working on the 3-d Laplace operator * @return The residual norm */ -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std::vector<double _Complex> &lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link, + int laplace3D); /** * @brief Verify a staggered-type singular vector @@ -262,11 +268,12 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co * @param eig_param Eigensolve params, used to query the operator type, etc * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace + * @param laplace3D Whether we are working on the 3-d Laplace operator * @return The residual norm */ double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, - double _Complex sigma, int i, QudaEigParam &eig_param, - quda::GaugeField &fat_link, quda::GaugeField &long_link); + const std::vector<double _Complex> &sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link, int laplace3D); /** * @brief Verify the spinor distance reweighting
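Note: the eigenvalue arguments above are now vectors so that the batched 3-d solver can pass one value per timeslice; for the ordinary 4-d operator the vector simply has length one. A minimal calling sketch (the names evals, evecs, cpuFatQDP and cpuLongQDP follow the eigensolve test further below; the wrapping itself is illustrative, not part of the patch):

    // 4-d operator: one eigenvalue per eigenvector, laplace3D = 4
    std::vector<double _Complex> lambda = {evals[i]};
    double resid = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP,
                                                  4 /* laplace3D: regular 4-d operator */);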
diff --git a/tests/host_reference/staggered_dslash_reference.cpp index 6ecbfe94de..2e7e4e0f8a 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -29,13 +29,19 @@ * @param[in] oddBit The odd/even bit for the site index * @param[in] daggerBit Perform the ordinary dslash (0) or Hermitian conjugate (1) * @param[in] dslash_type The type of Dslash operation + * @param[in] laplace3D Whether we are applying the 3-d Laplace operator + * (in the case of dslash_type being QUDA_LAPLACE_DSLASH) */ template <typename real_t> void staggeredDslashReference(real_t *res, const real_t *const *fatlink, const real_t *const *longlink, const real_t *const *ghostFatlink, const real_t *const *ghostLonglink, const real_t *spinorField, const real_t *const *fwd_nbr_spinor, - const real_t *const *back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type) + const real_t *const *back_nbr_spinor, int oddBit, int daggerBit, + QudaDslashType dslash_type, int laplace3D) { + if (laplace3D < 4 && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("laplace3D = %d only supported for Laplace dslash (%d requested)", laplace3D, dslash_type); + #pragma omp parallel for for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0; @@ -66,6 +72,7 @@ int offset = stag_spinor_site_size * sid; for (int dir = 0; dir < 8; dir++) { + if (laplace3D == dir / 2) continue; // skip dimensions if needed const int nFace = dslash_type == QUDA_ASQTAD_DSLASH ? 3 : 1; const real_t *fatlnk = gaugeLink(sid, dir, oddBit, fatlinkEven, fatlinkOdd, ghostFatlinkEven, ghostFatlinkOdd, 1, 1); @@ -108,7 +115,7 @@ } void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type, int laplace3D) { // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } @@ -143,23 +150,22 @@ reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), reinterpret_cast(ghost_longlink), in.data(), reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type, + laplace3D); } else if (in.Precision() == QUDA_SINGLE_PRECISION) { staggeredDslashReference(out.data(), reinterpret_cast(qdp_fatlink), reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), reinterpret_cast(ghost_longlink), in.data(), reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type, + laplace3D); } } void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { - errorQuda("The spinor precision and gauge precision are not the same"); - } + checkPrecision(in, fat_link); // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -169,11 +175,12 @@ // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not // have the minus sign. Inverting the expected dagger convention // solves this discrepancy. - stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type); - stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type); + stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type, laplace3D); + stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type, laplace3D); if (dslash_type == QUDA_LAPLACE_DSLASH) { - double kappa = 1.0 / (8 + mass); + int dimension = laplace3D < 4 ?
3 : 4; + double kappa = 1.0 / (2 * dimension + mass); xpay(in.data(), kappa, out.data(), out.Length(), out.Precision()); } else { axpy(2 * mass, in.data(), out.data(), out.Length(), out.Precision()); @@ -181,12 +188,9 @@ } void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { - errorQuda("The spinor precision and gauge precision are not the same"); - } + checkPrecision(in, fat_link); // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -197,15 +201,16 @@ quda::ColorSpinorField tmp(csParam); // Apply mat in sequence - stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type); - stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type); + stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type, laplace3D); + stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type, laplace3D); } void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type, + int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); } + if (laplace3D < 4) errorQuda("Cannot use 3-d operator with e/o preconditioning"); + checkPrecision(in, fat_link); // assert we have single-parity spinors if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET) @@ -225,8 +230,8 @@ quda::ColorSpinorField tmp(csParam); // dagger bit does not matter - stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type); - stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type); + stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type, laplace3D); + stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type, laplace3D); double msq_x4 = mass * mass * 4; if (in.Precision() == QUDA_DOUBLE_PRECISION) {
diff --git a/tests/host_reference/staggered_dslash_reference.h index 34625abfa3..557be21bd0 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -21,9 +21,13 @@ void setDims(int *); * @param[in] oddBit 0 for D_eo, 1 for D_oe * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type, int laplace3D);
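Note: the laplace3D convention documented here can be read off the direction loop in the reference implementation above; a distilled sketch (illustrative only):

    // dir indexes the 8 one-hop terms: dir / 2 is the dimension (0 = x, 1 = y, 2 = z, 3 = t)
    // and dir % 2 selects the forward or backward hop, so laplace3D == dir / 2 drops both
    // hops in the orthogonal dimension, e.g. laplace3D == 3 drops dir 6 and 7 (the t hops).
    for (int dir = 0; dir < 8; dir++) {
      if (laplace3D == dir / 2) continue; // skip the orthogonal dimension
      // ... accumulate the fat-link (and long-link) hop terms as usual ...
    }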
/** * @brief Apply the full parity staggered-type dslash @@ -35,9 +39,13 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF * @param[in] mass Mass for the dslash operator * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the full parity staggered-type matdag_mat @@ -49,9 +57,13 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel * @param[in] mass Mass for the dslash operator * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the even-even or odd-odd preconditioned staggered dslash @@ -64,6 +76,11 @@ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const Ga * @param[in] dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator * @param[in] parity Parity of preconditioned dslash * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled).
*/ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type, + int laplace3D);
diff --git a/tests/io_test.cpp index 2618b0e479..25296b02b9 100644 --- a/tests/io_test.cpp +++ b/tests/io_test.cpp @@ -31,6 +31,9 @@ TEST_P(GaugeIOTest, verify) gauge_param.cpu_prec = ::testing::get<0>(param); if (!quda::is_enabled(gauge_param.cpu_prec)) GTEST_SKIP(); gauge_param.cuda_prec = gauge_param.cpu_prec; + gauge_param.cuda_prec_sloppy = gauge_param.cpu_prec; + gauge_param.cuda_prec_precondition = gauge_param.cpu_prec; + gauge_param.cuda_prec_eigensolver = gauge_param.cpu_prec; gauge_param.t_boundary = QUDA_PERIODIC_T; // Allocate host side memory for the gauge field.
diff --git a/tests/staggered_dslash_test_utils.h index a9b499d71a..1b5609bfda 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -85,14 +85,16 @@ struct StaggeredDslashTestWrapper { for (int i = 0; i < Nsrc; i++) { switch (dtest_type) { case dslash_test_type::Dslash: - stag_dslash(spinorRef[i], cpuFat, cpuLong, spinor[i], parity, dagger, dslash_type); + stag_dslash(spinorRef[i], cpuFat, cpuLong, spinor[i], parity, dagger, dslash_type, laplace3D); break; case dslash_test_type::MatPC: - stag_matpc(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, 0, parity, dslash_type); + stag_matpc(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, 0, parity, dslash_type, laplace3D); + break; + case dslash_test_type::Mat: + stag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type, laplace3D); break; - case dslash_test_type::Mat: stag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type); break; case dslash_test_type::MatDagMat: - stag_matdag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type); + stag_matdag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type, laplace3D); break; default: errorQuda("Test type %d not defined", static_cast<int>(dtest_type)); } @@ -406,12 +408,12 @@ struct StaggeredDslashTestWrapper { size_t ghost_bytes = cudaSpinor[0].GhostBytes(); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_GPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes); printfQuda(
diff --git a/tests/staggered_eigensolve_test.cpp index 25d10cf91a..95ebff044b 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -75,12 +75,6 @@ void init() gauge_param = newQudaGaugeParam();
setStaggeredGaugeParam(gauge_param); - QudaGaugeSmearParam smear_param; - if (gauge_smear) { - smear_param = newQudaGaugeSmearParam(); - setGaugeSmearParam(smear_param); - } - // Though no inversions are performed, the inv_param // structure contains all the information we need to // construct the dirac operator. @@ -166,10 +160,20 @@ std::vector eigensolve(test_t test_param) eig_param.compute_svd = ::testing::get<3>(test_param); eig_param.spectrum = ::testing::get<4>(test_param); - if (eig_param.use_pc) - eig_inv_param.solution_type = QUDA_MATPC_SOLUTION; - else - eig_inv_param.solution_type = QUDA_MAT_SOLUTION; + eig_inv_param.solution_type = eig_param.use_pc ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; + + // whether we are using the resident smeared gauge or not + eig_param.use_smeared_gauge = gauge_smear; + + if (dslash_type == QUDA_LAPLACE_DSLASH) { + int dimension = laplace3D < 4 ? 3 : 4; + eig_inv_param.kappa = 1.0 / (2 * dimension + mass); + eig_inv_param.laplace3D = laplace3D; + if (dimension == 3) { + eig_param.ortho_dim = laplace3D; + eig_param.ortho_dim_size_local = tdim; + } + } // For gtest testing, we prohibit the use of polynomial acceleration as // the fine tuning required can inhibit convergence of an otherwise @@ -192,24 +196,65 @@ std::vector eigensolve(test_t test_param) if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE)) display_test_info(eig_param); + // Gauge Smearing Routines + if (gauge_smear) { + quda::host_timer_t host_timer; + host_timer.start(); // start the timer + + std::vector<QudaGaugeObservableParam> obs_param(gauge_smear_steps / measurement_interval + 1); + for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { + obs_param[i] = newQudaGaugeObservableParam(); + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; + obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE; + obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + } + + // Here we set all the problem parameters for all supported smearing types. + QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam(); + setGaugeSmearParam(smear_param); + + switch (smear_param.smear_type) { + case QUDA_GAUGE_SMEAR_APE: + case QUDA_GAUGE_SMEAR_STOUT: + case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: + case QUDA_GAUGE_SMEAR_HYP: { + performGaugeSmearQuda(&smear_param, obs_param.data()); + break; + } + + // Here we have a typical use case which differs from simple smearing in that + // the user will want to compute the plaquette values in order to compute the gauge energy. + case QUDA_GAUGE_SMEAR_WILSON_FLOW: + case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: { + for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; + } + performWFlowQuda(&smear_param, obs_param.data()); + break; + } + default: errorQuda("Undefined gauge smear type %d given", smear_param.smear_type); + } + + host_timer.stop(); + printfQuda("Time for gauge smearing = %f\n", host_timer.last()); + } + // Vector construct START //---------------------------------------------------------------------------- // Host side arrays to store the eigenpairs computed by QUDA int n_eig = eig_n_conv; if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; - std::vector<quda::ColorSpinorField> evecs(n_eig); quda::ColorSpinorParam cs_param; constructStaggeredTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); // Void pointers to host side arrays, compatible with the QUDA interface. std::vector<void *> host_evecs_ptr(n_eig); // Allocate host side memory and pointers - for (int i = 0; i < n_eig; i++) { - evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].data(); - } + std::vector<quda::ColorSpinorField> evecs(n_eig, cs_param); + for (int i = 0; i < n_eig; i++) host_evecs_ptr[i] = evecs[i].data(); // Complex eigenvalues - std::vector<__complex__ double> evals(eig_n_conv); + int n_batch = laplace3D == 3 ? eig_param.ortho_dim_size_local * comm_dim(3) : 1; + std::vector<__complex__ double> evals(eig_n_conv * n_batch); // Vector construct END //---------------------------------------------------------------------------- @@ -226,19 +271,20 @@ printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", host_timer.last()); // Perform host side verification of eigenvector if requested. - // ... std::vector<double> residua(eig_n_conv, 0.0); // Perform host side verification of eigenvector if requested. if (verify_results) { for (int i = 0; i < eig_n_conv; i++) { if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) { - double _Complex sigma = evals[i]; + std::vector<double _Complex> sigma(n_batch); + for (auto b = 0; b < n_batch; b++) sigma[b] = evals[b * eig_n_conv + i]; residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, - cpuLongQDP); + cpuLongQDP, laplace3D); } else { - double _Complex lambda = evals[i]; - residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP); + std::vector<double _Complex> lambda(n_batch); + for (auto b = 0; b < n_batch; b++) lambda[b] = evals[b * eig_n_conv + i]; + residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP, laplace3D); } } } @@ -285,7 +331,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); + if (is_laplace(dslash_type)) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); }
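Note: the batched layout used above stores eigenvalue i of timeslice-batch b at evals[b * eig_n_conv + i]; with n_batch == 1 this reduces to the usual 4-d layout. The unpacking, condensed from the verification loop above (illustrative):

    // gather the per-timeslice eigenvalues belonging to eigenvector i
    int n_batch = laplace3D == 3 ? eig_param.ortho_dim_size_local * comm_dim(3) : 1;
    std::vector<double _Complex> lambda(n_batch);
    for (int b = 0; b < n_batch; b++) lambda[b] = evals[b * eig_n_conv + i];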
diff --git a/tests/staggered_eigensolve_test_gtest.hpp index c73c85378e..8918c3df4d 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -32,6 +32,9 @@ bool skip_test(test_t test_param) auto compute_svd = ::testing::get<3>(test_param); auto spectrum = ::testing::get<4>(test_param); + // 3-d operator only supported for Laplace + if (eig_type == QUDA_EIG_TR_LANCZOS_3D && dslash_type != QUDA_LAPLACE_DSLASH) return true; + // Reverse engineer the operator type QudaSolveType combo_solve_type = get_solve_type(use_norm_op, use_pc, compute_svd); if (combo_solve_type == QUDA_DIRECT_PC_SOLVE) { @@ -87,6 +90,7 @@ bool skip_test(test_t test_param) case QUDA_LAPLACE_DSLASH: switch (eig_type) { case QUDA_EIG_TR_LANCZOS: + case QUDA_EIG_TR_LANCZOS_3D: case QUDA_EIG_BLK_TR_LANCZOS: if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true; break; @@ -117,6 +121,8 @@ TEST_P(StaggeredEigensolveTest, verify) auto eig_type = ::testing::get<0>(GetParam()); if (eig_type == QUDA_EIG_IR_ARNOLDI || eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10; auto tol = factor * eig_param.tol; + if (dslash_type == QUDA_LAPLACE_DSLASH) laplace3D = eig_type == QUDA_EIG_TR_LANCZOS_3D ? 3 : 4; + for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol); } @@ -143,6 +149,9 @@ auto hermitian_solvers = Values(QUDA_EIG_TR_LANCZOS, QUDA_EIG_BLK_TR_LANCZOS, QU // Can solve non-hermitian systems auto non_hermitian_solvers = Values(QUDA_EIG_IR_ARNOLDI); +// Batched solvers for 3-d operators +auto batched_solvers = Values(QUDA_EIG_TR_LANCZOS_3D); + // Eigensolver spectrum types auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG); auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG, @@ -171,3 +180,9 @@ INSTANTIATE_TEST_SUITE_P(DirectFull, StaggeredEigensolveTest, ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum), gettestname); + +// 3-d full system direct solve +INSTANTIATE_TEST_SUITE_P(DirectFull3D, StaggeredEigensolveTest, + ::testing::Combine(batched_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), + Values(QUDA_BOOLEAN_FALSE), hermitian_spectrum), + gettestname);
diff --git a/tests/staggered_invert_test.cpp index 4177feb954..77d9cd5a75 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -450,9 +450,9 @@ std::vector> solve(test_t param) // Create an appropriate subset of the full out_multishift vector std::vector<quda::ColorSpinorField> out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift}; - res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param); + res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param, laplace3D); } else { - res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param, n); + res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param, laplace3D, n); } } } @@ -510,7 +510,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); + if (is_laplace(dslash_type)) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); }
diff --git a/tests/su3_test.cpp index 265fed1248..5ddfc52956 100644 --- a/tests/su3_test.cpp +++ b/tests/su3_test.cpp @@ -18,19 +18,6 @@ #define MAX(a, b) ((a) > (b) ?
(a) : (b)) -// Smearing variables -double gauge_smear_rho = 0.1; -double gauge_smear_epsilon = 0.1; -double gauge_smear_alpha = 0.6; -double gauge_smear_alpha1 = 0.75; -double gauge_smear_alpha2 = 0.6; -double gauge_smear_alpha3 = 0.3; -int gauge_smear_steps = 50; -QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; -int gauge_smear_dir_ignore = -1; -int measurement_interval = 5; -bool su_project = true; - void display_test_info() { printfQuda("running the following test:\n"); @@ -68,49 +55,6 @@ void display_test_info() return; } -void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app) -{ - CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE}, - {"stout", QUDA_GAUGE_SMEAR_STOUT}, - {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}, - {"hyp", QUDA_GAUGE_SMEAR_HYP}, - {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW}, - {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}}; - - // Option group for SU(3) related options - auto opgroup = quda_app->add_option_group("SU(3)", "Options controlling SU(3) tests"); - - opgroup - ->add_option( - "--su3-smear-type", - gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)") - ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map)); - ; - opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)"); - opgroup->add_option("--su3-smear-rho", gauge_smear_rho, - "rho coefficient for Stout and Over-Improved Stout smearing (default 0.1)"); - opgroup->add_option("--su3-smear-epsilon", gauge_smear_epsilon, - "epsilon coefficient for Over-Improved Stout smearing or Wilson flow (default 0.1)"); - opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)"); - opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)"); - opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)"); - opgroup->add_option( - "--su3-smear-dir-ignore", gauge_smear_dir_ignore, - "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)"); - opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)"); - opgroup->add_option("--su3-measurement-interval", measurement_interval, - "Measure the field energy and/or topological charge every Nth step (default 5) "); - opgroup->add_option("--su3-project", su_project, - "Project smeared gauge onto su3 manifold at measurement interval (default true)"); -} int main(int argc, char **argv) { @@ -257,7 +201,7 @@ QudaGaugeObservableParam *obs_param = new QudaGaugeObservableParam[gauge_smear_steps / measurement_interval + 1]; for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { obs_param[i] = newQudaGaugeObservableParam(); - obs_param[i].compute_plaquette = QUDA_BOOLEAN_FALSE; + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE; obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; }
diff --git a/tests/utils/command_line_params.cpp index 43d2745035..6cfe9cf0ad 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -73,7 +73,6 @@ quda::mgarray mg_vec_partfile = {}; QudaInverterType inv_type; bool inv_deflate = false; bool inv_multigrid = false; -bool gauge_smear = false; QudaInverterType precon_type = QUDA_INVALID_INVERTER; QudaSchwarzType precon_schwarz_type = QUDA_INVALID_SCHWARZ; QudaAcceleratorType precon_accelerator_type = QUDA_INVALID_ACCELERATOR; @@ -302,13 +301,18 @@ double eofa_mq2 = 0.85; double eofa_mq3 = 1.0; // SU(3) smearing options +bool gauge_smear = false; +QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; double gauge_smear_rho = 0.1; -double gauge_smear_epsilon = 1.0; +double gauge_smear_epsilon = 0.1; double gauge_smear_alpha = 0.6; +double gauge_smear_alpha1 = 0.75; +double gauge_smear_alpha2 = 0.6; +double gauge_smear_alpha3 = 0.3; int gauge_smear_steps = 5; -QudaWFlowType wflow_type = QUDA_WFLOW_TYPE_WILSON; +int gauge_smear_dir_ignore = -1; int measurement_interval = 5; -QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; +bool su_project = true; // contract options QudaContractType contract_type = QUDA_CONTRACT_TYPE_STAGGERED_FT_T; @@ -410,6 +414,7 @@ namespace CLI::TransformPairs<QudaEigType> eig_type_map {{"trlm", QUDA_EIG_TR_LANCZOS}, {"blktrlm", QUDA_EIG_BLK_TR_LANCZOS}, + {"trlm-3d", QUDA_EIG_TR_LANCZOS_3D}, {"iram", QUDA_EIG_IR_ARNOLDI}, {"blkiram", QUDA_EIG_BLK_IR_ARNOLDI}}; @@ -453,11 +458,12 @@ namespace {"SR", QUDA_SPECTRUM_SR_EIG}, {"LR", QUDA_SPECTRUM_LR_EIG}, {"SM", QUDA_SPECTRUM_SM_EIG}, {"LM", QUDA_SPECTRUM_LM_EIG}, {"SI", QUDA_SPECTRUM_SI_EIG}, {"LI", QUDA_SPECTRUM_LI_EIG}}; - CLI::TransformPairs<QudaWFlowType> wflow_type_map {{"wilson", QUDA_WFLOW_TYPE_WILSON}, - {"symanzik", QUDA_WFLOW_TYPE_SYMANZIK}}; - - CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map { - {"ape", QUDA_GAUGE_SMEAR_APE}, {"stout", QUDA_GAUGE_SMEAR_STOUT}, {"ovr-imp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}}; + CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE}, + {"stout", QUDA_GAUGE_SMEAR_STOUT}, + {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}, + {"hyp", QUDA_GAUGE_SMEAR_HYP}, + {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW}, + {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}}; CLI::TransformPairs<QudaSetupType> setup_type_map {{"test", QUDA_TEST_VECTOR_SETUP}, {"null", QUDA_TEST_VECTOR_SETUP}}; @@ -765,7 +771,6 @@ std::shared_ptr make_app(std::string app_description, std::string app_n void add_eigen_option_group(std::shared_ptr<QUDAApp> quda_app) { - CLI::QUDACheckedTransformer prec_transform(precision_map); // Option group for Eigensolver related options auto opgroup = quda_app->add_option_group("Eigensolver", "Options controlling eigensolver"); @@ -1118,9 +1123,16 @@ void add_eofa_option_group(std::shared_ptr<QUDAApp> quda_app) void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app) { - // Option group for SU(3) related options auto opgroup = quda_app->add_option_group("SU(3)", "Options controlling SU(3) tests"); + + opgroup + ->add_option( + "--su3-smear-type", + gauge_smear_type, "The type of action to use in the smearing.
Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)") + ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map)); + ; + opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)"); opgroup->add_option("--su3-smear-rho", gauge_smear_rho, @@ -1128,18 +1140,23 @@ void add_su3_option_group(std::shared_ptr quda_app) opgroup->add_option( "--su3-smear-epsilon", gauge_smear_epsilon, - "epsilon coefficient for Over-Improved Stout smearing and step size for Wilson flow (default 1.0)"); + "epsilon coefficient for Over-Improved Stout smearing and step size for Wilson flow (default 0.1)"); - opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 10)"); + opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)"); + opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)"); + opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)"); - opgroup->add_option("--su3-wflow-type", wflow_type, "The type of action to use in the wilson flow (default wilson)") - ->transform(CLI::QUDACheckedTransformer(wflow_type_map)); + opgroup->add_option( + "--su3-smear-dir-ignore", gauge_smear_dir_ignore, + "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)"); - opgroup->add_option("--su3-smear-type", gauge_smear_type, "The type of smearing to use (default stout)") - ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map)); + opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 5)"); opgroup->add_option("--su3-measurement-interval", measurement_interval, - "Measure the field energy and topological charge every Nth step (default 5) "); + "Measure the field energy and/or topological charge every Nth step (default 5) "); + + opgroup->add_option("--su3-project", su_project, + "Project smeared gauge onto su3 manifold at measurement interval (default true)"); } void add_madwf_option_group(std::shared_ptr<QUDAApp> quda_app)
diff --git a/tests/utils/command_line_params.h index 338dcc73bc..38e81ebdc0 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -554,10 +554,14 @@ extern double eofa_mq3; extern double gauge_smear_rho; extern double gauge_smear_epsilon; extern double gauge_smear_alpha; +extern double gauge_smear_alpha1; +extern double gauge_smear_alpha2; +extern double gauge_smear_alpha3; extern int gauge_smear_steps; -extern QudaWFlowType wflow_type; +extern int gauge_smear_dir_ignore; extern int measurement_interval; extern QudaGaugeSmearType gauge_smear_type; +extern bool su_project; extern double smear_coeff; extern int smear_n_steps;
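Note: the smearing flags consolidated above all funnel into setGaugeSmearParam() (see set_params.cpp below), so any test binary linking command_line_params can drive any smear type. A hypothetical invocation exercising the new HYP coefficients (the flag values are only examples):

    // e.g.: ./su3_test --su3-smear-type hyp --su3-smear-alpha1 0.75 \
    //                  --su3-smear-alpha2 0.6 --su3-smear-alpha3 0.3 \
    //                  --su3-smear-steps 5 --su3-measurement-interval 5
    QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam();
    setGaugeSmearParam(smear_param); // copies smear_type, rho, epsilon, alpha, alpha1-3, dir_ignore, n_steps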
len, QudaPrecision precision) +double norm_2(void *v, int len, QudaPrecision precision, bool global) { if (precision == QUDA_DOUBLE_PRECISION) - return norm2((double *)v, len); + return norm2((double *)v, len, global); else - return norm2((float *)v, len); + return norm2((float *)v, len, global); } // performs the operation y[i] = x[i] + a*y[i] diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index c63d124dc8..473a527ac5 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -250,7 +250,7 @@ void exchange_llfat_init(QudaPrecision prec); void exchange_llfat_cleanup(void); // Implemented in host_blas.cpp -double norm_2(void *vector, int len, QudaPrecision precision); +double norm_2(void *vector, int len, QudaPrecision precision, bool global = true); void mxpy(void *x, void *y, int len, QudaPrecision precision); void ax(double a, void *x, int len, QudaPrecision precision); void cax(double _Complex a, void *x, int len, QudaPrecision precision); diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp index 8ec29c514b..58cc3fb758 100644 --- a/tests/utils/misc.cpp +++ b/tests/utils/misc.cpp @@ -221,6 +221,7 @@ const char *get_eig_type_str(QudaEigType type) switch (type) { case QUDA_EIG_TR_LANCZOS: ret = "trlm"; break; case QUDA_EIG_BLK_TR_LANCZOS: ret = "blktrlm"; break; + case QUDA_EIG_TR_LANCZOS_3D: ret = "trlm_3d"; break; case QUDA_EIG_IR_ARNOLDI: ret = "iram"; break; case QUDA_EIG_BLK_IR_ARNOLDI: ret = "blkiram"; break; default: ret = "unknown eigensolver"; break; diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index 7dd43a651a..98bf9da590 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -5,12 +5,16 @@ void setGaugeSmearParam(QudaGaugeSmearParam &smear_param) { + smear_param.smear_type = gauge_smear_type; smear_param.alpha = gauge_smear_alpha; smear_param.rho = gauge_smear_rho; smear_param.epsilon = gauge_smear_epsilon; smear_param.n_steps = gauge_smear_steps; smear_param.meas_interval = measurement_interval; - smear_param.smear_type = gauge_smear_type; + smear_param.alpha1 = gauge_smear_alpha1; + smear_param.alpha2 = gauge_smear_alpha2; + smear_param.alpha3 = gauge_smear_alpha3; + smear_param.dir_ignore = gauge_smear_dir_ignore; smear_param.struct_size = sizeof(smear_param); } @@ -125,16 +129,17 @@ void setInvertParam(QudaInvertParam &inv_param) { // Set dslash type inv_param.dslash_type = dslash_type; + int dimension = laplace3D < 4 ? 3 : 4; // Use kappa or mass normalisation if (kappa == -1.0) { inv_param.mass = mass; inv_param.kappa = 1.0 / (2.0 * (1 + 3 / anisotropy + mass)); - if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (8 + mass); + if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (2 * dimension + mass); } else { inv_param.kappa = kappa; inv_param.mass = 0.5 / kappa - (1.0 + 3.0 / anisotropy); - if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 8.0; + if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 2 * dimension; } if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Kappa = %.8f Mass = %.8f\n", inv_param.kappa, inv_param.mass); @@ -935,7 +940,8 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param) // Solver params inv_param.verbosity = verbosity; inv_param.mass = mass; - inv_param.kappa = kappa = 1.0 / (8.0 + mass); // for Laplace operator + int dimension = laplace3D < 4 ? 
3 : 4; + inv_param.kappa = 1.0 / (2 * dimension + mass); // for Laplace operator inv_param.laplace3D = laplace3D; // for Laplace operator if (Nsrc < Nsrc_tile || Nsrc % Nsrc_tile != 0) @@ -1002,7 +1008,7 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param) inv_param.solve_type = solve_type; inv_param.matpc_type = matpc_type; inv_param.dagger = QUDA_DAG_NO; - inv_param.mass_normalization = QUDA_MASS_NORMALIZATION; + inv_param.mass_normalization = dslash_type == QUDA_LAPLACE_DSLASH ? QUDA_KAPPA_NORMALIZATION : QUDA_MASS_NORMALIZATION; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = prec;
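Note: the Laplace normalization used throughout this patch is a simple round trip: with d active dimensions (d = 3 when laplace3D < 4, otherwise 4), kappa = 1 / (2 d + mass), and hence mass = 1 / kappa - 2 d when kappa is specified directly. A worked check (values illustrative):

    int dimension = laplace3D < 4 ? 3 : 4;           // e.g. laplace3D = 3 gives d = 3
    double kappa = 1.0 / (2 * dimension + mass);     // mass = 1.0, d = 3 -> kappa = 1/7
    double mass_check = 1.0 / kappa - 2 * dimension; // recovers mass = 1.0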