From bc889990f667280e19402d020c956bac0bc9f6fb Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Mon, 8 Jan 2024 01:16:14 +0800
Subject: [PATCH 01/22] Add `dir_ignore` to `QudaGaugeSmearParam`.

---
 include/gauge_tools.h           |  9 ++++++---
 include/kernels/gauge_ape.cuh   |  9 ++++++---
 include/kernels/gauge_stout.cuh | 12 +++++++----
 include/quda.h                  |  1 +
 lib/gauge_ape.cu                | 27 +++++++++++++++++--------
 lib/gauge_stout.cu              | 36 ++++++++++++++++++++++++---------
 lib/interface_quda.cpp          |  6 +++---
 7 files changed, 69 insertions(+), 31 deletions(-)
diff --git a/include/gauge_tools.h b/include/gauge_tools.h
index 9b7d68db37..80d3a4f12b 100644
--- a/include/gauge_tools.h
+++ b/include/gauge_tools.h
@@ -95,8 +95,9 @@ namespace quda
      @param[out] dataDs Output smeared field
      @param[in] dataOr Input gauge field
      @param[in] alpha smearing parameter
+     @param[in] dir_ignore ignored direction
   */
-  void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha);
+  void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha, int dir_ignore);
 
   /**
      @brief Apply STOUT smearing to the gauge field
@@ -104,8 +105,9 @@ namespace quda
      @param[out] dataDs Output smeared field
      @param[in] dataOr Input gauge field
      @param[in] rho smearing parameter
+     @param[in] dir_ignore ignored direction
   */
-  void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho);
+  void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, int dir_ignore);
 
   /**
      @brief Apply Over Improved STOUT smearing to the gauge field
@@ -114,8 +116,9 @@ namespace quda
      @param[in] dataOr Input gauge field
      @param[in] rho smearing parameter
      @param[in] epsilon smearing parameter
+     @param[in] dir_ignore ignored direction
   */
-  void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon);
+  void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon, int dir_ignore);
 
   /**
      @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field.
diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh
index 3bf490dfea..226f426f73 100644
--- a/include/kernels/gauge_ape.cuh
+++ b/include/kernels/gauge_ape.cuh
@@ -26,13 +26,15 @@ namespace quda
     int X[4];    // grid dimensions
     int border[4];
     const Float alpha;
+    const int dir_ignore;
     const Float tolerance;
 
-    GaugeAPEArg(GaugeField &out, const GaugeField &in, double alpha) :
+    GaugeAPEArg(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) :
       kernel_param(dim3(in.LocalVolumeCB(), 2, apeDim)),
       out(out),
       in(in),
       alpha(alpha),
+      dir_ignore(dir_ignore),
       tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL)
     {
       for (int dir = 0; dir < 4; ++dir) {
@@ -61,16 +63,17 @@ namespace quda
         x[dr] += arg.border[dr];
         X[dr] += 2 * arg.border[dr];
       }
+      dir = dir + (dir >= arg.dir_ignore);
 
       int dx[4] = {0, 0, 0, 0};
       Link U, Stap, TestU, I;
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
-      computeStaple(arg, x, X, parity, dir, Stap, Arg::apeDim);
+      computeStaple(arg, x, X, parity, dir, Stap, arg.dir_ignore);
 
       // Get link U
       U = arg.in(dir, linkIndexShift(x, dx, X), parity);
     
-      Stap = Stap * (arg.alpha / ((real)(2. * (3. - 1.))));
+      Stap = Stap * (arg.alpha / ((real)(2 * (Arg::apeDim - 1))));
       setIdentity(&I);
 
       TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap * conj(U);
diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh
index 2a512e77e3..f9072a4ee8 100644
--- a/include/kernels/gauge_stout.cuh
+++ b/include/kernels/gauge_stout.cuh
@@ -28,14 +28,16 @@ namespace quda
     const Float rho;
     const Float staple_coeff;
     const Float rectangle_coeff;
+    const int dir_ignore;
 
-    STOUTArg(GaugeField &out, const GaugeField &in, Float rho, Float epsilon = 0) :
+    STOUTArg(GaugeField &out, const GaugeField &in, Float rho, Float epsilon, int dir_ignore) :
       kernel_param(dim3(1, 2, stoutDim)),
       out(out),
       in(in),
       rho(rho),
       staple_coeff(rho * (5.0 - 2.0 * epsilon) / 3.0),
-      rectangle_coeff(rho * (1.0 - epsilon) / 12.0)
+      rectangle_coeff(rho * (1.0 - epsilon) / 12.0),
+      dir_ignore(dir_ignore)
     {
       for (int dir = 0; dir < 4; ++dir) {
         border[dir] = in.R()[dir];
@@ -67,11 +69,12 @@ namespace quda
         x[dr] += arg.border[dr];
         X[dr] += 2 * arg.border[dr];
       }
+      dir = dir + (dir >= arg.dir_ignore);
 
       Link U, Stap, Q;
 
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
-      computeStaple(arg, x, X, parity, dir, Stap, Arg::stoutDim);
+      computeStaple(arg, x, X, parity, dir, Stap, arg.dir_ignore);
 
       // Get link U
       U = arg.in(dir, linkIndex(x, X), parity);
@@ -133,6 +136,7 @@ namespace quda
         x[dr] += arg.border[dr];
         X[dr] += 2 * arg.border[dr];
       }
+      dir = dir + (dir >= arg.dir_ignore);
 
       Link U, Q;
       ThreadLocalCache<Link, 0, computeStapleRectangleOps> Stap;
@@ -141,7 +145,7 @@ namespace quda
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
       // and the 1x2 and 2x1 rectangles of length 5. From the following paper:
       // https://arxiv.org/abs/0801.1165
-      computeStapleRectangle(arg, x, X, parity, dir, Stap, Rect, Arg::stoutDim);
+      computeStapleRectangle(arg, x, X, parity, dir, Stap, Rect, arg.dir_ignore);
 
       // Get link U
       U = arg.in(dir, linkIndex(x, X), parity);
diff --git a/include/quda.h b/include/quda.h
index d11fd4cbd0..ed586e1b54 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -848,6 +848,7 @@ extern "C" {
     double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */
     unsigned int meas_interval;    /**< Perform the requested measurements on the gauge field at this interval */
     QudaGaugeSmearType smear_type; /**< The smearing type to perform */
+    int dir_ignore; /**< The direction to be ignored by the smearing algorithm */
   } QudaGaugeSmearParam;
 
   typedef struct QudaBLASParam_s {
diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu
index a2c8a92dd8..3f51ef566a 100644
--- a/lib/gauge_ape.cu
+++ b/lib/gauge_ape.cu
@@ -8,21 +8,26 @@ namespace quda {
 
   template <typename Float, int nColor, QudaReconstructType recon> class GaugeAPE : TunableKernel3D
   {
-    static constexpr int apeDim = 3; // apply APE in space only
     GaugeField &out;
     const GaugeField &in;
     const Float alpha;
+    const int dir_ignore;
+    const int apeDim;
     unsigned int minThreads() const { return in.LocalVolumeCB(); }
     unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array
 
   public:
-    // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim
-    GaugeAPE(GaugeField &out, const GaugeField &in, double alpha) :
-      TunableKernel3D(in, 2, apeDim),
+    // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    GaugeAPE(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) :
+      TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
       in(in),
-      alpha(static_cast<Float>(alpha))
+      alpha(static_cast<Float>(alpha)),
+      dir_ignore(dir_ignore),
+      apeDim((dir_ignore == 4) ? 4 : 3)
     {
+      strcat(aux, ",dir_ignore=");
+      i32toa(aux + strlen(aux), dir_ignore);
       strcat(aux, comm_dim_partitioned_string());
       apply(device::get_default_stream());
     }
@@ -30,7 +35,11 @@ namespace quda {
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      launch<APE>(tp, stream, GaugeAPEArg<Float,nColor,recon, apeDim>(out, in, alpha));
+      if (apeDim == 3) {
+        launch<APE>(tp, stream, GaugeAPEArg<Float, nColor, recon, 3>(out, in, alpha, dir_ignore));
+      } else if (apeDim == 4) {
+        launch<APE>(tp, stream, GaugeAPEArg<Float, nColor, recon, 4>(out, in, alpha, dir_ignore));
+      }
     }
 
     void preTune() { out.backup(); } // defensive measure in case they alias
@@ -50,16 +59,18 @@ namespace quda {
 
   }; // GaugeAPE
 
-  void APEStep(GaugeField &out, GaugeField& in, double alpha)
+  void APEStep(GaugeField &out, GaugeField& in, double alpha, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
     checkNative(out, in);
 
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    instantiate<GaugeAPE>(out, in, alpha);
+    instantiate<GaugeAPE>(out, in, alpha, dir_ignore);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu
index 48d7e638e8..2a234d861b 100644
--- a/lib/gauge_stout.cu
+++ b/lib/gauge_stout.cu
@@ -12,6 +12,7 @@ namespace quda {
     const bool improved;
     const Float rho;
     const Float epsilon;
+    const int dir_ignore;
     const int stoutDim;
     unsigned int minThreads() const { return in.LocalVolumeCB(); }
 
@@ -24,17 +25,20 @@ namespace quda {
     }
 
   public:
-    // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim
-    GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon = 0.0) :
-      TunableKernel3D(in, 2, improved ? 4 : 3),
+    // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon, int dir_ignore) :
+      TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
       in(in),
       improved(improved),
       rho(static_cast<Float>(rho)),
       epsilon(static_cast<Float>(epsilon)),
-      stoutDim(improved ? 4 : 3)
+      dir_ignore(dir_ignore),
+      stoutDim((dir_ignore == 4) ? 4 : 3)
     {
       if (improved) strcat(aux, ",improved");
+      strcat(aux, ",dir_ignore=");
+      i32toa(aux + strlen(aux), dir_ignore);
       strcat(aux, comm_dim_partitioned_string());
       apply(device::get_default_stream());
     }
@@ -43,10 +47,18 @@ namespace quda {
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       if (!improved) {
-        launch<STOUT>(tp, stream, STOUTArg<Float, nColor, recon, 3>(out, in, rho));
+        if (stoutDim == 3) {
+          launch<STOUT>(tp, stream, STOUTArg<Float, nColor, recon, 3>(out, in, rho, 0.0, dir_ignore));
+        } else if (stoutDim == 4) {
+          launch<STOUT>(tp, stream, STOUTArg<Float, nColor, recon, 4>(out, in, rho, 0.0, dir_ignore));
+        }
       } else if (improved) {
         tp.set_max_shared_bytes = true;
-        launch<OvrImpSTOUT>(tp, stream, STOUTArg<Float, nColor, recon, 4>(out, in, rho, epsilon));
+        if (stoutDim == 3) {
+          launch<OvrImpSTOUT>(tp, stream, STOUTArg<Float, nColor, recon, 3>(out, in, rho, epsilon, dir_ignore));
+        } else if (stoutDim == 4) {
+          launch<OvrImpSTOUT>(tp, stream, STOUTArg<Float, nColor, recon, 4>(out, in, rho, epsilon, dir_ignore));
+        }
       }
     }
 
@@ -71,30 +83,34 @@ namespace quda {
               out.Reconstruct() * out.Precision()) * stoutDim * in.LocalVolume();    }
   };
 
-  void STOUTStep(GaugeField &out, GaugeField &in, double rho)
+  void STOUTStep(GaugeField &out, GaugeField &in, double rho, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
     checkNative(out, in);
 
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    instantiate<GaugeSTOUT>(out, in, false, rho);
+    instantiate<GaugeSTOUT>(out, in, false, rho, 0.0, dir_ignore);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
 
-  void OvrImpSTOUTStep(GaugeField &out, GaugeField& in, double rho, double epsilon)
+  void OvrImpSTOUTStep(GaugeField &out, GaugeField& in, double rho, double epsilon, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
     checkNative(out, in);
 
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    instantiate<GaugeSTOUT>(out, in, true, rho, epsilon);
+    instantiate<GaugeSTOUT>(out, in, true, rho, epsilon, dir_ignore);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index b07a0b2c7b..4d655945ab 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -5090,10 +5090,10 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
 
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
     switch (smear_param->smear_type) {
-    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha); break;
-    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho); break;
+    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha, smear_param->dir_ignore); break;
+    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->dir_ignore); break;
     case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
-      OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon);
+      OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore);
       break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
     }

From 4aaa49a9ece59af928fa406de946aec0d3fc896e Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Mon, 8 Jan 2024 20:46:21 +0800
Subject: [PATCH 02/22] Add HYP smearing.

---
 include/enum_quda.h           |   1 +
 include/enum_quda_fortran.h   |   5 +-
 include/gauge_tools.h         |  12 +
 include/kernels/gauge_hyp.cuh | 445 ++++++++++++++++++++++++++++++++++
 include/quda.h                |   3 +
 lib/CMakeLists.txt            |   2 +-
 lib/check_params.h            |   8 +
 lib/gauge_hyp.cu              | 107 ++++++++
 lib/interface_quda.cpp        |  21 ++
 tests/su3_test.cpp            |  26 +-
 tests/utils/misc.cpp          |   1 +
 11 files changed, 626 insertions(+), 5 deletions(-)
 create mode 100644 include/kernels/gauge_hyp.cuh
 create mode 100644 lib/gauge_hyp.cu

diff --git a/include/enum_quda.h b/include/enum_quda.h
index c4cbb59901..60c186eb77 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -571,6 +571,7 @@ typedef enum QudaGaugeSmearType_s {
   QUDA_GAUGE_SMEAR_APE,
   QUDA_GAUGE_SMEAR_STOUT,
   QUDA_GAUGE_SMEAR_OVRIMP_STOUT,
+  QUDA_GAUGE_SMEAR_HYP,
   QUDA_GAUGE_SMEAR_WILSON_FLOW,
   QUDA_GAUGE_SMEAR_SYMANZIK_FLOW,
   QUDA_GAUGE_SMEAR_INVALID = QUDA_INVALID_ENUM
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index 6a8708948a..ada17e2cc8 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -504,8 +504,9 @@
 #define QUDA_GAUGE_SMEAR_APE 0
 #define QUDA_GAUGE_SMEAR_STOUT 1
 #define QUDA_GAUGE_SMEAR_OVRIMP_STOUT 2
-#define QUDA_GAUGE_SMEAR_WILSON_FLOW 3
-#define QUDA_GAUGE_SMEAR_SYMANZIK_FLOW 4
+#define QUDA_GAUGE_SMEAR_HYP 3
+#define QUDA_GAUGE_SMEAR_WILSON_FLOW 4
+#define QUDA_GAUGE_SMEAR_SYMANZIK_FLOW 5
 #define QUDA_GAUGE_SMEAR_INVALID QUDA_INVALID_ENUM
 
 #define QudaFermionSmearType integer(4)
diff --git a/include/gauge_tools.h b/include/gauge_tools.h
index 80d3a4f12b..46a84fed83 100644
--- a/include/gauge_tools.h
+++ b/include/gauge_tools.h
@@ -120,6 +120,18 @@ namespace quda
   */
   void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon, int dir_ignore);
 
+  /**
+     @brief Apply HYP smearing to the gauge field
+     @param[out] dataDs Output smeared field
+     @param[in] dataTemp Temp space
+     @param[in] dataOr Input gauge field
+     @param[in] alpha1 smearing parameter
+     @param[in] alpha2 smearing parameter
+     @param[in] alpha3 smearing parameter
+     @param[in] dir_ignore ignored direction
+  */
+  void HYPStep(GaugeField &dataDs, GaugeField *dataTemp[4], GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore);
+
   /**
      @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field.
      This routine assumes that the input and output fields are
diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
new file mode 100644
index 0000000000..6184a31fa9
--- /dev/null
+++ b/include/kernels/gauge_hyp.cuh
@@ -0,0 +1,445 @@
+#include <gauge_field_order.h>
+#include <index_helper.cuh>
+#include <quda_matrix.h>
+#include <su3_project.cuh>
+#include <kernel.h>
+#include <kernels/gauge_utils.cuh>
+
+namespace quda
+{
+
+#define  DOUBLE_TOL	1e-15
+#define  SINGLE_TOL	2e-6
+
+  template <typename Float_, int nColor_, QudaReconstructType recon_, int level_, int hypDim_>
+  struct GaugeHYPArg : kernel_param<> {
+    using Float = Float_;
+    static constexpr int nColor = nColor_;
+    static_assert(nColor == 3, "Only nColor=3 enabled at this time");
+    static constexpr QudaReconstructType recon = recon_;
+    static constexpr int hypDim = hypDim_;
+    static constexpr int level = level_;
+    typedef typename gauge_mapper<Float,recon>::type Gauge;
+
+    Gauge out;
+    Gauge tmp[4];
+    const Gauge in;
+
+    int X[4];    // grid dimensions
+    int border[4];
+    const Float alpha;
+    const int dir_ignore;
+    const Float tolerance;
+
+    GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) :
+      kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)),
+      out(out),
+      tmp{*tmp[0], *tmp[1], *tmp[2], *tmp[3]},
+      in(in),
+      alpha(alpha),
+      dir_ignore(dir_ignore),
+      tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL)
+    {
+      for (int dir = 0; dir < 4; ++dir) {
+        border[dir] = in.R()[dir];
+        X[dir] = in.X()[dir] - border[dir] * 2;
+      }
+    }
+  };
+
+  template <typename Arg, typename Staple, typename Int>
+  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3])
+  {
+    using Link = typename get_type<Staple>::type;
+    for (int i = 0; i < 3; ++i) staple[i] = Link();
+
+    thread_array<int, 4> dx = { };
+    int cnt = -1;
+#pragma unroll
+    for (int nu = 0; nu < 4; ++nu) {
+      // Identify directions orthogonal to the link and
+      // ignore the dir_ignore direction (usually the temporal dim
+      // when used with STOUT or APE for measurement smearing)
+      if (nu == mu) continue;
+
+      cnt += 1;
+
+      {
+        // Get link U_{\nu}(x)
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, X), parity);
+
+        // Get link U_{\mu}(x+\nu)
+        dx[nu]++;
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity);
+        dx[nu]--;
+
+        // Get link U_{\nu}(x+\mu)
+        dx[mu]++;
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity);
+        dx[mu]--;
+
+        // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
+        staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
+      }
+
+      {
+        // Get link U_{\nu}(x-\nu)
+        dx[nu]--;
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity);
+        // Get link U_{\mu}(x-\nu)
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity);
+
+        // Get link U_{\nu}(x-\nu+\mu)
+        dx[mu]++;
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, X), parity);
+
+        // reset dx
+        dx[mu]--;
+        dx[nu]++;
+
+        // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
+        staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
+      }
+    }
+  }
+
+  template <typename Arg, typename Staple, typename Int>
+  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3])
+  {
+    using Link = typename get_type<Staple>::type;
+    for (int i = 0; i < 3; ++i) staple[i] = Link();
+
+    thread_array<int, 4> dx = { };
+    int cnt = -1;
+#pragma unroll
+    for (int nu = 0; nu < 4; nu++) {
+      // Identify directions orthogonal to the link and
+      // ignore the dir_ignore direction (usually the temporal dim
+      // when used with STOUT or APE for measurement smearing)
+      if (nu == mu) continue;
+
+      cnt += 1;
+
+      for (int rho = 0; rho < 4; ++rho) {
+        if (rho == mu || rho == nu) continue;
+        int sigma = 0;
+        while (sigma == mu || sigma == nu || sigma == rho) sigma += 1;
+
+        const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
+        const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
+
+        {
+          // Get link U_{\rho}(x)
+          Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), parity);
+
+          // Get link U_{\mu}(x+\rho)
+          dx[rho]++;
+          Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+          dx[rho]--;
+
+          // Get link U_{\rho}(x+\mu)
+          dx[mu]++;
+          Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), 1 - parity);
+          dx[mu]--;
+
+          // staple += U_{\rho}(x) * U_{\mu}(x+\rho) * U^\dag_{\rho}(x+\mu)
+          staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
+        }
+
+        {
+          // Get link U_{\rho}(x-\rho)
+          dx[rho]--;
+          Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), 1 - parity);
+          // Get link U_{\mu}(x-\rho)
+          Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+
+          // Get link U_{\rho}(x-\rho+\mu)
+          dx[mu]++;
+          Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), parity);
+
+          // reset dx
+          dx[mu]--;
+          dx[rho]++;
+
+          // staple += U^\dag_{\rho}(x-\rho) * U_{\mu}(x-\rho) * U_{\rho}(x-\rho+\mu)
+          staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
+        }
+      }
+    }
+  }
+
+  template <typename Arg, typename Staple, typename Int>
+  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple &staple)
+  {
+    using Link = typename get_type<Staple>::type;
+    staple = Link();
+
+    thread_array<int, 4> dx = { };
+#pragma unroll
+    for (int nu = 0; nu < 4 ; nu++) {
+      // Identify directions orthogonal to the link and
+      // ignore the dir_ignore direction (usually the temporal dim
+      // when used with STOUT or APE for measurement smearing)
+      if (nu == mu) continue;
+
+      const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu);
+      const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
+
+      {
+        // Get link U_{\nu}(x)
+        Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), parity);
+
+        // Get link U_{\mu}(x+\nu)
+        dx[nu]++;
+        Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+        dx[nu]--;
+
+        // Get link U_{\nu}(x+\mu)
+        dx[mu]++;
+        Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), 1 - parity);
+        dx[mu]--;
+
+        // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
+        staple = staple + U1 * U2 * conj(U3);
+      }
+
+      {
+        // Get link U_{\nu}(x-\nu)
+        dx[nu]--;
+        Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), 1 - parity);
+        // Get link U_{\mu}(x-\nu)
+        Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+
+        // Get link U_{\nu}(x-\nu+\mu)
+        dx[mu]++;
+        Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), parity);
+
+        // reset dx
+        dx[mu]--;
+        dx[nu]++;
+
+        // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
+        staple = staple + conj(U1) * U2 * U3;
+      }
+    }
+  }
+
+  template <typename Arg> struct HYP {
+    const Arg &arg;
+    constexpr HYP(const Arg &arg) : arg(arg) {}
+    static constexpr const char* filename() { return KERNEL_FILE; }
+
+    __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
+    {
+      using real = typename Arg::Float;
+      typedef Matrix<complex<real>, Arg::nColor> Link;
+
+      // compute spacetime and local coords
+      int X[4];
+      for (int dr = 0; dr < 4; ++dr) X[dr] = arg.X[dr];
+      int x[4];
+      getCoords(x, x_cb, X, parity);
+      for (int dr = 0; dr < 4; ++dr) {
+        x[dr] += arg.border[dr];
+        X[dr] += 2 * arg.border[dr];
+      }
+
+      int dx[4] = {0, 0, 0, 0};
+      Link U, Stap[3], TestU, I;
+
+      // Get link U
+      U = arg.in(dir, linkIndexShift(x, dx, X), parity);
+      setIdentity(&I);
+
+      if constexpr (Arg::level == 1) {
+        computeStapleLevel1(arg, x, X, parity, dir, Stap);
+
+        for (int i = 0; i < 3; ++i) {
+          TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.));
+          polarSu3<real>(TestU, arg.tolerance);
+          arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U;
+        }
+      } else if constexpr (Arg::level == 2) {
+        computeStapleLevel2(arg, x, X, parity, dir, Stap);
+
+        for (int i = 0; i < 3; ++i) {
+          TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)4.));
+          polarSu3<real>(TestU, arg.tolerance);
+          arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U;
+        }
+      } else if constexpr (Arg::level == 3) {
+        computeStapleLevel3(arg, x, X, parity, dir, Stap[0]);
+
+        TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.));
+        polarSu3<real>(TestU, arg.tolerance);
+        arg.out(dir, linkIndexShift(x, dx, X), parity) = TestU * U;
+      }
+    }
+  };
+
+  template <typename Arg, typename Staple, typename Int>
+  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], const int dir_ignore)
+  {
+    using Link = typename get_type<Staple>::type;
+    for (int i = 0; i < 2; ++i) staple[i] = Link();
+
+    thread_array<int, 4> dx = { };
+    int cnt = -1;
+#pragma unroll
+    for (int nu = 0; nu < 4; ++nu) {
+      // Identify directions orthogonal to the link and
+      // ignore the dir_ignore direction (usually the temporal dim
+      // when used with STOUT or APE for measurement smearing)
+      if (nu == mu || nu == dir_ignore) continue;
+
+      cnt += 1;
+
+      {
+        // Get link U_{\nu}(x)
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, X), parity);
+
+        // Get link U_{\mu}(x+\nu)
+        dx[nu]++;
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity);
+        dx[nu]--;
+
+        // Get link U_{\nu}(x+\mu)
+        dx[mu]++;
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity);
+        dx[mu]--;
+
+        // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
+        staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
+      }
+
+      {
+        // Get link U_{\nu}(x-\nu)
+        dx[nu]--;
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity);
+        // Get link U_{\mu}(x-\nu)
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity);
+
+        // Get link U_{\nu}(x-\nu+\mu)
+        dx[mu]++;
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, X), parity);
+
+        // reset dx
+        dx[mu]--;
+        dx[nu]++;
+
+        // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
+        staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
+      }
+    }
+  }
+
+  template <typename Arg, typename Staple, typename Int>
+  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], int dir_ignore)
+  {
+    using Link = typename get_type<Staple>::type;
+    for (int i = 0; i < 1; ++i) staple[i] = Link();
+
+    thread_array<int, 4> dx = { };
+    int cnt = -1;
+#pragma unroll
+    for (int nu = 0; nu < 4; nu++) {
+      // Identify directions orthogonal to the link and
+      // ignore the dir_ignore direction (usually the temporal dim
+      // when used with STOUT or APE for measurement smearing)
+      if (nu == mu || nu == dir_ignore) continue;
+
+      cnt += 1;
+
+      for (int rho = 0; rho < 4; ++rho) {
+        if (rho == mu || rho == nu || rho == dir_ignore) continue;
+
+        const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore);
+        const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore);
+
+        {
+          // Get link U_{\nu}(x)
+          Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), parity);
+
+          // Get link U_{\mu}(x+\nu)
+          dx[nu]++;
+          Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+          dx[nu]--;
+
+          // Get link U_{\nu}(x+\mu)
+          dx[mu]++;
+          Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), 1 - parity);
+          dx[mu]--;
+
+          // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
+          staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
+        }
+
+        {
+          // Get link U_{\nu}(x-\nu)
+          dx[nu]--;
+          Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), 1 - parity);
+          // Get link U_{\mu}(x-\nu)
+          Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+
+          // Get link U_{\nu}(x-\nu+\mu)
+          dx[mu]++;
+          Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), parity);
+
+          // reset dx
+          dx[mu]--;
+          dx[rho]++;
+
+          // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
+          staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
+        }
+      }
+    }
+  }
+
+  template <typename Arg> struct HYP3D {
+    const Arg &arg;
+    constexpr HYP3D(const Arg &arg) : arg(arg) {}
+    static constexpr const char* filename() { return KERNEL_FILE; }
+
+    __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
+    {
+      using real = typename Arg::Float;
+      typedef Matrix<complex<real>, Arg::nColor> Link;
+
+      // compute spacetime and local coords
+      int X[4];
+      for (int dr = 0; dr < 4; ++dr) X[dr] = arg.X[dr];
+      int x[4];
+      getCoords(x, x_cb, X, parity);
+      for (int dr = 0; dr < 4; ++dr) {
+        x[dr] += arg.border[dr];
+        X[dr] += 2 * arg.border[dr];
+      }
+      int dir_ = dir;
+      dir = dir + (dir >= arg.dir_ignore);
+
+      int dx[4] = {0, 0, 0, 0};
+      Link U, Stap[2], TestU, I;
+
+      // Get link U
+      U = arg.in(dir, linkIndexShift(x, dx, X), parity);
+      setIdentity(&I);
+
+      if constexpr (Arg::level == 1) {
+        computeStaple3DLevel1(arg, x, X, parity, dir, Stap, arg.dir_ignore);
+
+        for (int i = 0; i < 2; ++i) {
+          TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.));
+          polarSu3<real>(TestU, arg.tolerance);
+          arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, X), parity) = TestU * U;
+        }
+      } else if constexpr (Arg::level == 2) {
+        computeStaple3DLevel2(arg, x, X, parity, dir, Stap, arg.dir_ignore);
+
+        TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.));
+        polarSu3<real>(TestU, arg.tolerance);
+        arg.out(dir, linkIndexShift(x, dx, X), parity) = TestU * U;
+      }
+    }
+  };
+} // namespace quda
\ No newline at end of file
diff --git a/include/quda.h b/include/quda.h
index ed586e1b54..ccef6e3d80 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -846,6 +846,9 @@ extern "C" {
                              Wilson/Symanzik flow */
     double alpha;         /**< The single coefficient used in APE smearing */
     double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */
+    double alpha1;        /**< The first coefficient used in HYP smearing */
+    double alpha2;        /**< The second coefficient used in HYP smearing */
+    double alpha3;        /**< The third coefficient used in HYP smearing */
     unsigned int meas_interval;    /**< Perform the requested measurements on the gauge field at this interval */
     QudaGaugeSmearType smear_type; /**< The smearing type to perform */
     int dir_ignore; /**< The direction to be ignored by the smearing algorithm */
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index d8c1d8342b..e9b7945ea7 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -27,7 +27,7 @@ set (QUDA_OBJS
   gauge_phase.cu timer.cpp
   solver.cpp inv_bicgstab_quda.cpp inv_cg_quda.cpp inv_bicgstabl_quda.cpp
   inv_multi_cg_quda.cpp inv_eigcg_quda.cpp gauge_ape.cu
-  gauge_stout.cu gauge_wilson_flow.cu gauge_plaq.cu
+  gauge_stout.cu gauge_hyp.cu gauge_wilson_flow.cu gauge_plaq.cu
   gauge_laplace.cpp gauge_observable.cpp
   inv_cg3_quda.cpp inv_ca_gcr.cpp inv_ca_cg.cpp
   inv_gcr_quda.cpp inv_mr_quda.cpp inv_sd_quda.cpp
diff --git a/lib/check_params.h b/lib/check_params.h
index 00892741f2..06247d0306 100644
--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -1100,12 +1100,20 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param)
   P(alpha, 0.0);
   P(rho, 0.0);
   P(epsilon, 0.0);
+  P(alpha1, 0.0);
+  P(alpha2, 0.0);
+  P(alpha3, 0.0);
+  P(dir_ignore, 4);
 #else
   P(n_steps, (unsigned int)INVALID_INT);
   P(meas_interval, (unsigned int)INVALID_INT);
   P(alpha, INVALID_DOUBLE);
   P(rho, INVALID_DOUBLE);
   P(epsilon, INVALID_DOUBLE);
+  P(alpha1, INVALID_DOUBLE);
+  P(alpha2, INVALID_DOUBLE);
+  P(alpha3, INVALID_DOUBLE);
+  P(dir_ignore, INVALID_INT);
 #endif
 
 #ifdef INIT_PARAM
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
new file mode 100644
index 0000000000..7c587a97e1
--- /dev/null
+++ b/lib/gauge_hyp.cu
@@ -0,0 +1,107 @@
+#include <quda_internal.h>
+#include <gauge_field.h>
+#include <tunable_nd.h>
+#include <instantiate.h>
+#include <kernels/gauge_hyp.cuh>
+
+namespace quda {
+
+  template <typename Float, int nColor, QudaReconstructType recon> class GaugeHYP : TunableKernel3D
+  {
+    GaugeField &out;
+    GaugeField *tmp[4];
+    const GaugeField &in;
+    const Float alpha;
+    const int level;
+    const int dir_ignore;
+    const int hypDim;
+    unsigned int minThreads() const { return in.LocalVolumeCB(); }
+    unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array
+
+  public:
+    // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) :
+      TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
+      out(out),
+      tmp{tmp[0], tmp[1], tmp[2], tmp[3]},
+      in(in),
+      alpha(static_cast<Float>(alpha)),
+      level(level),
+      dir_ignore(dir_ignore),
+      hypDim((dir_ignore == 4) ? 4 : 3)
+    {
+      strcat(aux, ",level=");
+      i32toa(aux + strlen(aux), level);
+      strcat(aux, ",dir_ignore=");
+      i32toa(aux + strlen(aux), dir_ignore);
+      strcat(aux, comm_dim_partitioned_string());
+      apply(device::get_default_stream());
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      if (hypDim == 4) {
+        if (level == 1) {
+          launch<HYP>(tp, stream, GaugeHYPArg<Float,nColor,recon,1,4>(out, tmp, in, alpha, dir_ignore));
+        } else if (level == 2) {
+          launch<HYP>(tp, stream, GaugeHYPArg<Float,nColor,recon,2,4>(out, tmp, in, alpha, dir_ignore));
+        } else if (level == 3) {
+          launch<HYP>(tp, stream, GaugeHYPArg<Float,nColor,recon,3,4>(out, tmp, in, alpha, dir_ignore));
+        }
+      } else if (hypDim == 3) {
+        if (level == 1) {
+          launch<HYP3D>(tp, stream, GaugeHYPArg<Float,nColor,recon,1,3>(out, tmp, in, alpha, dir_ignore));
+        } else if (level == 2) {
+          launch<HYP3D>(tp, stream, GaugeHYPArg<Float,nColor,recon,2,3>(out, tmp, in, alpha, dir_ignore));
+        }
+      }
+    }
+
+    void preTune() { out.backup(); } // defensive measure in case they alias
+    void postTune() { out.restore(); }
+
+    long long flops() const
+    {
+      auto mat_flops = in.Ncolor() * in.Ncolor() * (8ll * in.Ncolor() - 2ll);
+      return (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
+    }
+
+    long long bytes() const // 6 links per dim, 1 in, 1 out.
+    {
+      return ((1 + (hypDim - 1) * 6) * in.Reconstruct() * in.Precision() +
+              out.Reconstruct() * out.Precision()) * hypDim * in.LocalVolume();
+    }
+
+  }; // GaugeAPE
+
+  void HYPStep(GaugeField &out, GaugeField *tmp[4], GaugeField& in, double alpha1, double alpha2, double alpha3, int dir_ignore)
+  {
+    checkPrecision(out, in);
+    checkReconstruct(out, in);
+    checkNative(out, in);
+
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
+    if (dir_ignore == 4) {
+      copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
+      in.exchangeExtendedGhost(in.R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha3, 1, dir_ignore);
+      tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false);
+      tmp[1]->exchangeExtendedGhost(tmp[1]->R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
+      tmp[2]->exchangeExtendedGhost(tmp[2]->R(), false);
+      tmp[3]->exchangeExtendedGhost(tmp[3]->R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha1, 3, dir_ignore);
+      out.exchangeExtendedGhost(out.R(), false);
+    } else {
+      copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
+      in.exchangeExtendedGhost(in.R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha3, 1, dir_ignore);
+      tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
+      out.exchangeExtendedGhost(out.R(), false);
+    }
+  }
+
+}
\ No newline at end of file
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 4d655945ab..cc8ffa6254 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -5084,6 +5084,20 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   GaugeField tmp(gParam);
 
+  const int smearDim = (smear_param->dir_ignore >= 0 && smear_param->dir_ignore <= 3) ? 3 : 4;
+  GaugeField *aux[4];
+  if (smear_param->smear_type == QUDA_GAUGE_SMEAR_HYP) {
+    gParam.geometry = QUDA_TENSOR_GEOMETRY;
+    if (smearDim == 3) {
+      aux[0] = new GaugeField(gParam);
+      // aux[1], aux[2] and aux[3] will not be used for smearDim=3
+      gParam.create = QUDA_REFERENCE_FIELD_CREATE;
+      for (int i = 1; i < 4; ++i) { aux[i] = new GaugeField(gParam); }
+    } else {
+      for (int i = 0; i < 4; ++i) { aux[i] = new GaugeField(gParam); }
+    }
+  }
+
   int measurement_n = 0; // The nth measurement to take
   gaugeObservablesQuda(&obs_param[measurement_n]);
   logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge);
@@ -5095,6 +5109,9 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
     case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
       OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore);
       break;
+    case QUDA_GAUGE_SMEAR_HYP:
+      HYPStep(*gaugeSmeared, aux, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore);
+      break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
     }
 
@@ -5105,6 +5122,10 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
     }
   }
 
+  if (smear_param->smear_type == QUDA_GAUGE_SMEAR_HYP) {
+    for (int i = 0; i < 4; ++i) { delete aux[i]; }
+  }
+
   popOutputPrefix();
 }
 
diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp
index a03726f3fb..3adc58e6d7 100644
--- a/tests/su3_test.cpp
+++ b/tests/su3_test.cpp
@@ -22,8 +22,12 @@
 double gauge_smear_rho = 0.1;
 double gauge_smear_epsilon = 0.1;
 double gauge_smear_alpha = 0.6;
+double gauge_smear_alpha1 = 0.75;
+double gauge_smear_alpha2 = 0.6;
+double gauge_smear_alpha3 = 0.3;
 int gauge_smear_steps = 50;
 QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
+int gauge_smear_dir_ignore = 4;
 int measurement_interval = 5;
 bool su_project = true;
 
@@ -45,11 +49,17 @@ void display_test_info()
     printfQuda(" - rho %f\n", gauge_smear_rho);
     printfQuda(" - epsilon %f\n", gauge_smear_epsilon);
     break;
+  case QUDA_GAUGE_SMEAR_HYP:
+    printfQuda(" - alpha1 %f\n", gauge_smear_alpha1);
+    printfQuda(" - alpha2 %f\n", gauge_smear_alpha2);
+    printfQuda(" - alpha3 %f\n", gauge_smear_alpha3);
+    break;
   case QUDA_GAUGE_SMEAR_WILSON_FLOW:
   case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: printfQuda(" - epsilon %f\n", gauge_smear_epsilon); break;
   default: errorQuda("Undefined test type %d given", test_type);
   }
   printfQuda(" - smearing steps %d\n", gauge_smear_steps);
+  printfQuda(" - smearing ignore direction %d\n", gauge_smear_dir_ignore);
   printfQuda(" - Measurement interval %d\n", measurement_interval);
 
   printfQuda("Grid partition info:     X  Y  Z  T\n");
@@ -63,6 +73,7 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
   CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE},
                                                                 {"stout", QUDA_GAUGE_SMEAR_STOUT},
                                                                 {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT},
+                                                                {"hyp", QUDA_GAUGE_SMEAR_HYP},
                                                                 {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW},
                                                                 {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}};
 
@@ -72,7 +83,7 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup
     ->add_option(
       "--su3-smear-type",
-      gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, Wilson Flow, Symanzik Flow (default stout)")
+      gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)")
     ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map));
   ;
   opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)");
@@ -83,6 +94,12 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup->add_option("--su3-smear-epsilon", gauge_smear_epsilon,
                       "epsilon coefficient for Over-Improved Stout smearing or Wilson flow (default 0.1)");
 
+  opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)");
+  opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)");
+  opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)");
+
+  opgroup->add_option("--su3-smear-dir-ignore", gauge_smear_dir_ignore, "direction to be ignored by the smearing (default 4)");
+
   opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)");
 
   opgroup->add_option("--su3-measurement-interval", measurement_interval,
@@ -251,12 +268,17 @@ int main(int argc, char **argv)
   smear_param.alpha = gauge_smear_alpha;
   smear_param.rho = gauge_smear_rho;
   smear_param.epsilon = gauge_smear_epsilon;
+  smear_param.alpha1 = gauge_smear_alpha1;
+  smear_param.alpha2 = gauge_smear_alpha2;
+  smear_param.alpha3 = gauge_smear_alpha3;
+  smear_param.dir_ignore = gauge_smear_dir_ignore;
 
   host_timer.start(); // start the timer
   switch (smear_param.smear_type) {
   case QUDA_GAUGE_SMEAR_APE:
   case QUDA_GAUGE_SMEAR_STOUT:
-  case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: {
+  case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
+  case QUDA_GAUGE_SMEAR_HYP: {
     performGaugeSmearQuda(&smear_param, obs_param);
     break;
   }
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index fd920e5e16..b213ee6f1e 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -189,6 +189,7 @@ const char *get_gauge_smear_str(QudaGaugeSmearType type)
   case QUDA_GAUGE_SMEAR_APE: ret = "APE"; break;
   case QUDA_GAUGE_SMEAR_STOUT: ret = "Stout"; break;
   case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: ret = "Over Improved"; break;
+  case QUDA_GAUGE_SMEAR_HYP: ret = "HYP"; break;
   case QUDA_GAUGE_SMEAR_WILSON_FLOW: ret = "Wilson Flow"; break;
   case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: ret = "Symanzik Flow"; break;
   default: ret = "unknown"; break;

From 2089093d74d3ed8ee18c88765cee88036c8693d1 Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Mon, 8 Jan 2024 22:33:00 +0800
Subject: [PATCH 03/22] Update `flops()` and `bytes()` for HYP smearing. Fix
 bugs in 3D HYP smearing.

---
 include/kernels/gauge_hyp.cuh | 20 +++++++++-----------
 lib/gauge_hyp.cu              | 28 ++++++++++++++++++++++++----
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 6184a31fa9..19f456304a 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -169,10 +169,10 @@ namespace quda
   }
 
   template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple &staple)
+  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3])
   {
     using Link = typename get_type<Staple>::type;
-    staple = Link();
+    staple[0] = Link();
 
     thread_array<int, 4> dx = { };
 #pragma unroll
@@ -200,7 +200,7 @@ namespace quda
         dx[mu]--;
 
         // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
-        staple = staple + U1 * U2 * conj(U3);
+        staple[0] = staple[0] + U1 * U2 * conj(U3);
       }
 
       {
@@ -219,7 +219,7 @@ namespace quda
         dx[nu]++;
 
         // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
-        staple = staple + conj(U1) * U2 * U3;
+        staple[0] = staple[0] + conj(U1) * U2 * U3;
       }
     }
   }
@@ -268,7 +268,7 @@ namespace quda
           arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 3) {
-        computeStapleLevel3(arg, x, X, parity, dir, Stap[0]);
+        computeStapleLevel3(arg, x, X, parity, dir, Stap);
 
         TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.));
         polarSu3<real>(TestU, arg.tolerance);
@@ -337,7 +337,7 @@ namespace quda
   __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], int dir_ignore)
   {
     using Link = typename get_type<Staple>::type;
-    for (int i = 0; i < 1; ++i) staple[i] = Link();
+    staple[0] = Link();
 
     thread_array<int, 4> dx = { };
     int cnt = -1;
@@ -348,8 +348,6 @@ namespace quda
       // when used with STOUT or APE for measurement smearing)
       if (nu == mu || nu == dir_ignore) continue;
 
-      cnt += 1;
-
       for (int rho = 0; rho < 4; ++rho) {
         if (rho == mu || rho == nu || rho == dir_ignore) continue;
 
@@ -371,7 +369,7 @@ namespace quda
           dx[mu]--;
 
           // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
-          staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
+          staple[0] = staple[0] + U1 * U2 * conj(U3);
         }
 
         {
@@ -387,10 +385,10 @@ namespace quda
 
           // reset dx
           dx[mu]--;
-          dx[rho]++;
+          dx[nu]++;
 
           // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
-          staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
+          staple[0] = staple[0] + conj(U1) * U2 * U3;
         }
       }
     }
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index 7c587a97e1..b4860cd701 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -64,13 +64,33 @@ namespace quda {
     long long flops() const
     {
       auto mat_flops = in.Ncolor() * in.Ncolor() * (8ll * in.Ncolor() - 2ll);
-      return (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
+      if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) {
+        return ((hypDim - 1) * 2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
+      } else if (hypDim == 4 && level == 2) {
+        return ((hypDim - 1) * 2 + (hypDim - 1) * (hypDim - 2) * 4) * mat_flops * hypDim * in.LocalVolume();
+      } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) {
+        return (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
+      }
     }
 
-    long long bytes() const // 6 links per dim, 1 in, 1 out.
+    long long bytes() const
     {
-      return ((1 + (hypDim - 1) * 6) * in.Reconstruct() * in.Precision() +
-              out.Reconstruct() * out.Precision()) * hypDim * in.LocalVolume();
+      if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { // 6 links per dim, 1 in, hypDim-1 tmp
+        return (in.Reconstruct() * in.Precision()
+                + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision()
+                + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
+          * hypDim * in.LocalVolume();
+      } else if (hypDim == 4 && level == 2) { // 6 links per dim, 1 in, hypDim-1 tmp
+        return (in.Reconstruct() * in.Precision()
+                + (hypDim - 1) * (hypDim - 2) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
+                + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
+          * hypDim * in.LocalVolume();
+      } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { // 6 links per dim, 1 in, 1 out
+        return (in.Reconstruct() * in.Precision()
+                + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
+                + out.Reconstruct() * out.Precision())
+          * hypDim * in.LocalVolume();
+      }
     }
 
   }; // GaugeAPE

From 4d073f75d27ff03ae1b7567c0cec49a8e38dce1b Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Wed, 10 Jan 2024 12:55:07 +0800
Subject: [PATCH 04/22] Make compiler happy with STRICT build.

---
 include/kernels/gauge_hyp.cuh |  1 -
 lib/gauge_hyp.cu              | 28 ++++++++++++++++------------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 19f456304a..e30091a02c 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -340,7 +340,6 @@ namespace quda
     staple[0] = Link();
 
     thread_array<int, 4> dx = { };
-    int cnt = -1;
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index b4860cd701..84281c0b1a 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -63,34 +63,38 @@ namespace quda {
 
     long long flops() const
     {
+      long long flops = 0;
       auto mat_flops = in.Ncolor() * in.Ncolor() * (8ll * in.Ncolor() - 2ll);
       if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) {
-        return ((hypDim - 1) * 2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
+        flops += ((hypDim - 1) * 2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
       } else if (hypDim == 4 && level == 2) {
-        return ((hypDim - 1) * 2 + (hypDim - 1) * (hypDim - 2) * 4) * mat_flops * hypDim * in.LocalVolume();
+        flops += ((hypDim - 1) * 2 + (hypDim - 1) * (hypDim - 2) * 4) * mat_flops * hypDim * in.LocalVolume();
       } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) {
-        return (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
+        flops += (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
       }
+      return flops;
     }
 
     long long bytes() const
     {
+      long long bytes = 0;
       if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { // 6 links per dim, 1 in, hypDim-1 tmp
-        return (in.Reconstruct() * in.Precision()
-                + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision()
-                + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
+        bytes += (in.Reconstruct() * in.Precision()
+                  + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision()
+                  + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
           * hypDim * in.LocalVolume();
       } else if (hypDim == 4 && level == 2) { // 6 links per dim, 1 in, hypDim-1 tmp
-        return (in.Reconstruct() * in.Precision()
-                + (hypDim - 1) * (hypDim - 2) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
-                + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
+        bytes += (in.Reconstruct() * in.Precision()
+                  + (hypDim - 1) * (hypDim - 2) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
+                  + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
           * hypDim * in.LocalVolume();
       } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { // 6 links per dim, 1 in, 1 out
-        return (in.Reconstruct() * in.Precision()
-                + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
-                + out.Reconstruct() * out.Precision())
+        bytes += (in.Reconstruct() * in.Precision()
+                  + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
+                  + out.Reconstruct() * out.Precision())
           * hypDim * in.LocalVolume();
       }
+      return bytes;
     }
 
   }; // GaugeAPE

From 0fc4cb4123531c1b2d40d6cd42a61803ca47f44a Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Wed, 10 Jan 2024 13:06:14 +0800
Subject: [PATCH 05/22] Set default tolerance of SU3 projection 10x epsilon.
 This solves #1425.

---
 include/kernels/gauge_ape.cuh | 4 ++--
 include/kernels/gauge_hyp.cuh | 4 ++--
 lib/gauge_observable.cpp      | 2 +-
 lib/interface_quda.cpp        | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh
index 226f426f73..7bee11651e 100644
--- a/include/kernels/gauge_ape.cuh
+++ b/include/kernels/gauge_ape.cuh
@@ -8,8 +8,8 @@
 namespace quda
 {
 
-#define  DOUBLE_TOL	1e-15
-#define  SINGLE_TOL	2e-6
+#define  DOUBLE_TOL	2e-15
+#define  SINGLE_TOL	1e-6
 
   template <typename Float_, int nColor_, QudaReconstructType recon_, int apeDim_>
   struct GaugeAPEArg : kernel_param<> {
diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index e30091a02c..886b2c688f 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -8,8 +8,8 @@
 namespace quda
 {
 
-#define  DOUBLE_TOL	1e-15
-#define  SINGLE_TOL	2e-6
+#define  DOUBLE_TOL	2e-15
+#define  SINGLE_TOL	1e-6
 
   template <typename Float_, int nColor_, QudaReconstructType recon_, int level_, int hypDim_>
   struct GaugeHYPArg : kernel_param<> {
diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp
index 041dc6164d..5994cf155f 100644
--- a/lib/gauge_observable.cpp
+++ b/lib/gauge_observable.cpp
@@ -12,7 +12,7 @@ namespace quda
       int *num_failures_h = static_cast<int *>(pool_pinned_malloc(sizeof(int)));
       int *num_failures_d = static_cast<int *>(get_mapped_device_pointer(num_failures_h));
       *num_failures_h = 0;
-      auto tol = u.Precision() == QUDA_DOUBLE_PRECISION ? 1e-14 : 1e-6;
+      auto tol = u.Precision() == QUDA_DOUBLE_PRECISION ? 2e-15 : 1e-6;
       projectSU3(u, tol, num_failures_d);
       if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
       pool_pinned_free(num_failures_h);
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index cc8ffa6254..6555314dc7 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3746,7 +3746,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
     // project onto SU(3) if using the Chroma convention
     if (param->staggered_phase_type == QUDA_STAGGERED_PHASE_CHROMA) {
       *num_failures_h = 0;
-      const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 1e-15 : 2e-6;
+      const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 2e-15 : 1e-6;
       if (unitarizedLink.StaggeredPhaseApplied()) unitarizedLink.removeStaggeredPhase();
       projectSU3(unitarizedLink, tol, num_failures_d);
       if (!unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied)

From e240272118d7281e27a2abc95f212520b89474e7 Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Wed, 10 Jan 2024 13:07:16 +0800
Subject: [PATCH 06/22] clang-format

---
 include/gauge_tools.h         |  3 ++-
 include/kernels/gauge_ape.cuh |  6 ++---
 include/kernels/gauge_hyp.cuh | 45 +++++++++++++++++++----------------
 include/quda.h                |  8 +++----
 lib/gauge_ape.cu              |  2 +-
 lib/gauge_hyp.cu              | 26 ++++++++++----------
 lib/gauge_stout.cu            |  2 +-
 lib/interface_quda.cpp        |  3 ++-
 tests/su3_test.cpp            |  3 ++-
 9 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/include/gauge_tools.h b/include/gauge_tools.h
index 46a84fed83..cfd3f1e8ba 100644
--- a/include/gauge_tools.h
+++ b/include/gauge_tools.h
@@ -130,7 +130,8 @@ namespace quda
      @param[in] alpha3 smearing parameter
      @param[in] dir_ignore ignored direction
   */
-  void HYPStep(GaugeField &dataDs, GaugeField *dataTemp[4], GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore);
+  void HYPStep(GaugeField &dataDs, GaugeField *dataTemp[4], GaugeField &dataOr, double alpha1, double alpha2,
+               double alpha3, int dir_ignore);
 
   /**
      @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field.
diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh
index 7bee11651e..60b3b6a1a0 100644
--- a/include/kernels/gauge_ape.cuh
+++ b/include/kernels/gauge_ape.cuh
@@ -8,8 +8,8 @@
 namespace quda
 {
 
-#define  DOUBLE_TOL	2e-15
-#define  SINGLE_TOL	1e-6
+#define DOUBLE_TOL 2e-15
+#define SINGLE_TOL 1e-6
 
   template <typename Float_, int nColor_, QudaReconstructType recon_, int apeDim_>
   struct GaugeAPEArg : kernel_param<> {
@@ -72,7 +72,7 @@ namespace quda
 
       // Get link U
       U = arg.in(dir, linkIndexShift(x, dx, X), parity);
-    
+
       Stap = Stap * (arg.alpha / ((real)(2 * (Arg::apeDim - 1))));
       setIdentity(&I);
 
diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 886b2c688f..6988908776 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -8,8 +8,8 @@
 namespace quda
 {
 
-#define  DOUBLE_TOL	2e-15
-#define  SINGLE_TOL	1e-6
+#define DOUBLE_TOL 2e-15
+#define SINGLE_TOL 1e-6
 
   template <typename Float_, int nColor_, QudaReconstructType recon_, int level_, int hypDim_>
   struct GaugeHYPArg : kernel_param<> {
@@ -19,13 +19,13 @@ namespace quda
     static constexpr QudaReconstructType recon = recon_;
     static constexpr int hypDim = hypDim_;
     static constexpr int level = level_;
-    typedef typename gauge_mapper<Float,recon>::type Gauge;
+    typedef typename gauge_mapper<Float, recon>::type Gauge;
 
     Gauge out;
     Gauge tmp[4];
     const Gauge in;
 
-    int X[4];    // grid dimensions
+    int X[4]; // grid dimensions
     int border[4];
     const Float alpha;
     const int dir_ignore;
@@ -34,7 +34,7 @@ namespace quda
     GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) :
       kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)),
       out(out),
-      tmp{*tmp[0], *tmp[1], *tmp[2], *tmp[3]},
+      tmp {*tmp[0], *tmp[1], *tmp[2], *tmp[3]},
       in(in),
       alpha(alpha),
       dir_ignore(dir_ignore),
@@ -48,12 +48,13 @@ namespace quda
   };
 
   template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3])
+  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const Int *X, const int parity,
+                                                      const int mu, Staple staple[3])
   {
     using Link = typename get_type<Staple>::type;
     for (int i = 0; i < 3; ++i) staple[i] = Link();
 
-    thread_array<int, 4> dx = { };
+    thread_array<int, 4> dx = {};
     int cnt = -1;
 #pragma unroll
     for (int nu = 0; nu < 4; ++nu) {
@@ -104,12 +105,13 @@ namespace quda
   }
 
   template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3])
+  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const Int *X, const int parity,
+                                                      const int mu, Staple staple[3])
   {
     using Link = typename get_type<Staple>::type;
     for (int i = 0; i < 3; ++i) staple[i] = Link();
 
-    thread_array<int, 4> dx = { };
+    thread_array<int, 4> dx = {};
     int cnt = -1;
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
@@ -169,14 +171,15 @@ namespace quda
   }
 
   template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3])
+  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity,
+                                                      const int mu, Staple staple[3])
   {
     using Link = typename get_type<Staple>::type;
     staple[0] = Link();
 
-    thread_array<int, 4> dx = { };
+    thread_array<int, 4> dx = {};
 #pragma unroll
-    for (int nu = 0; nu < 4 ; nu++) {
+    for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and
       // ignore the dir_ignore direction (usually the temporal dim
       // when used with STOUT or APE for measurement smearing)
@@ -226,8 +229,8 @@ namespace quda
 
   template <typename Arg> struct HYP {
     const Arg &arg;
-    constexpr HYP(const Arg &arg) : arg(arg) {}
-    static constexpr const char* filename() { return KERNEL_FILE; }
+    constexpr HYP(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
     {
@@ -278,12 +281,13 @@ namespace quda
   };
 
   template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], const int dir_ignore)
+  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const Int *X, const int parity,
+                                                        const int mu, Staple staple[2], const int dir_ignore)
   {
     using Link = typename get_type<Staple>::type;
     for (int i = 0; i < 2; ++i) staple[i] = Link();
 
-    thread_array<int, 4> dx = { };
+    thread_array<int, 4> dx = {};
     int cnt = -1;
 #pragma unroll
     for (int nu = 0; nu < 4; ++nu) {
@@ -334,12 +338,13 @@ namespace quda
   }
 
   template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], int dir_ignore)
+  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity,
+                                                        const int mu, Staple staple[2], int dir_ignore)
   {
     using Link = typename get_type<Staple>::type;
     staple[0] = Link();
 
-    thread_array<int, 4> dx = { };
+    thread_array<int, 4> dx = {};
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and
@@ -395,8 +400,8 @@ namespace quda
 
   template <typename Arg> struct HYP3D {
     const Arg &arg;
-    constexpr HYP3D(const Arg &arg) : arg(arg) {}
-    static constexpr const char* filename() { return KERNEL_FILE; }
+    constexpr HYP3D(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
 
     __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
     {
diff --git a/include/quda.h b/include/quda.h
index ccef6e3d80..8289c4e3d0 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -846,12 +846,12 @@ extern "C" {
                              Wilson/Symanzik flow */
     double alpha;         /**< The single coefficient used in APE smearing */
     double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */
-    double alpha1;        /**< The first coefficient used in HYP smearing */
-    double alpha2;        /**< The second coefficient used in HYP smearing */
-    double alpha3;        /**< The third coefficient used in HYP smearing */
+    double alpha1;                 /**< The first coefficient used in HYP smearing */
+    double alpha2;                 /**< The second coefficient used in HYP smearing */
+    double alpha3;                 /**< The third coefficient used in HYP smearing */
     unsigned int meas_interval;    /**< Perform the requested measurements on the gauge field at this interval */
     QudaGaugeSmearType smear_type; /**< The smearing type to perform */
-    int dir_ignore; /**< The direction to be ignored by the smearing algorithm */
+    int dir_ignore;                /**< The direction to be ignored by the smearing algorithm */
   } QudaGaugeSmearParam;
 
   typedef struct QudaBLASParam_s {
diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu
index 3f51ef566a..8d61066041 100644
--- a/lib/gauge_ape.cu
+++ b/lib/gauge_ape.cu
@@ -59,7 +59,7 @@ namespace quda {
 
   }; // GaugeAPE
 
-  void APEStep(GaugeField &out, GaugeField& in, double alpha, int dir_ignore)
+  void APEStep(GaugeField &out, GaugeField &in, double alpha, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index 84281c0b1a..331d1cfbf8 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -4,7 +4,8 @@
 #include <instantiate.h>
 #include <kernels/gauge_hyp.cuh>
 
-namespace quda {
+namespace quda
+{
 
   template <typename Float, int nColor, QudaReconstructType recon> class GaugeHYP : TunableKernel3D
   {
@@ -23,7 +24,7 @@ namespace quda {
     GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) :
       TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
-      tmp{tmp[0], tmp[1], tmp[2], tmp[3]},
+      tmp {tmp[0], tmp[1], tmp[2], tmp[3]},
       in(in),
       alpha(static_cast<Float>(alpha)),
       level(level),
@@ -43,17 +44,17 @@ namespace quda {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       if (hypDim == 4) {
         if (level == 1) {
-          launch<HYP>(tp, stream, GaugeHYPArg<Float,nColor,recon,1,4>(out, tmp, in, alpha, dir_ignore));
+          launch<HYP>(tp, stream, GaugeHYPArg<Float, nColor, recon, 1, 4>(out, tmp, in, alpha, dir_ignore));
         } else if (level == 2) {
-          launch<HYP>(tp, stream, GaugeHYPArg<Float,nColor,recon,2,4>(out, tmp, in, alpha, dir_ignore));
+          launch<HYP>(tp, stream, GaugeHYPArg<Float, nColor, recon, 2, 4>(out, tmp, in, alpha, dir_ignore));
         } else if (level == 3) {
-          launch<HYP>(tp, stream, GaugeHYPArg<Float,nColor,recon,3,4>(out, tmp, in, alpha, dir_ignore));
+          launch<HYP>(tp, stream, GaugeHYPArg<Float, nColor, recon, 3, 4>(out, tmp, in, alpha, dir_ignore));
         }
       } else if (hypDim == 3) {
         if (level == 1) {
-          launch<HYP3D>(tp, stream, GaugeHYPArg<Float,nColor,recon,1,3>(out, tmp, in, alpha, dir_ignore));
+          launch<HYP3D>(tp, stream, GaugeHYPArg<Float, nColor, recon, 1, 3>(out, tmp, in, alpha, dir_ignore));
         } else if (level == 2) {
-          launch<HYP3D>(tp, stream, GaugeHYPArg<Float,nColor,recon,2,3>(out, tmp, in, alpha, dir_ignore));
+          launch<HYP3D>(tp, stream, GaugeHYPArg<Float, nColor, recon, 2, 3>(out, tmp, in, alpha, dir_ignore));
         }
       }
     }
@@ -79,8 +80,7 @@ namespace quda {
     {
       long long bytes = 0;
       if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { // 6 links per dim, 1 in, hypDim-1 tmp
-        bytes += (in.Reconstruct() * in.Precision()
-                  + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision()
+        bytes += (in.Reconstruct() * in.Precision() + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision()
                   + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
           * hypDim * in.LocalVolume();
       } else if (hypDim == 4 && level == 2) { // 6 links per dim, 1 in, hypDim-1 tmp
@@ -89,8 +89,7 @@ namespace quda {
                   + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
           * hypDim * in.LocalVolume();
       } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { // 6 links per dim, 1 in, 1 out
-        bytes += (in.Reconstruct() * in.Precision()
-                  + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
+        bytes += (in.Reconstruct() * in.Precision() + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
                   + out.Reconstruct() * out.Precision())
           * hypDim * in.LocalVolume();
       }
@@ -99,7 +98,8 @@ namespace quda {
 
   }; // GaugeAPE
 
-  void HYPStep(GaugeField &out, GaugeField *tmp[4], GaugeField& in, double alpha1, double alpha2, double alpha3, int dir_ignore)
+  void HYPStep(GaugeField &out, GaugeField *tmp[4], GaugeField &in, double alpha1, double alpha2, double alpha3,
+               int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
@@ -128,4 +128,4 @@ namespace quda {
     }
   }
 
-}
\ No newline at end of file
+} // namespace quda
\ No newline at end of file
diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu
index 2a234d861b..2cf1c4e634 100644
--- a/lib/gauge_stout.cu
+++ b/lib/gauge_stout.cu
@@ -99,7 +99,7 @@ namespace quda {
     out.exchangeExtendedGhost(out.R(), false);
   }
 
-  void OvrImpSTOUTStep(GaugeField &out, GaugeField& in, double rho, double epsilon, int dir_ignore)
+  void OvrImpSTOUTStep(GaugeField &out, GaugeField &in, double rho, double epsilon, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 6555314dc7..bb144ba4df 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -5110,7 +5110,8 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
       OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore);
       break;
     case QUDA_GAUGE_SMEAR_HYP:
-      HYPStep(*gaugeSmeared, aux, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore);
+      HYPStep(*gaugeSmeared, aux, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3,
+              smear_param->dir_ignore);
       break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
     }
diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp
index 3adc58e6d7..acd2803b42 100644
--- a/tests/su3_test.cpp
+++ b/tests/su3_test.cpp
@@ -98,7 +98,8 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)");
   opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)");
 
-  opgroup->add_option("--su3-smear-dir-ignore", gauge_smear_dir_ignore, "direction to be ignored by the smearing (default 4)");
+  opgroup->add_option("--su3-smear-dir-ignore", gauge_smear_dir_ignore,
+                      "direction to be ignored by the smearing (default 4)");
 
   opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)");
 

From 2839225e7eda42b43ccb8dadc88f3f99c1673662 Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Wed, 10 Jan 2024 14:42:06 +0800
Subject: [PATCH 07/22] Use `GaugeField::toleranceSU3()` to get the default tol
 for SU(3) projection.

---
 include/gauge_field.h         | 13 +++++++++++++
 include/kernels/gauge_ape.cuh |  5 +----
 include/kernels/gauge_hyp.cuh |  5 +----
 lib/gauge_observable.cpp      |  2 +-
 lib/interface_quda.cpp        |  2 +-
 5 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index ffea94c22b..a62b6a684f 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -638,6 +638,19 @@ namespace quda {
      */
     static bool are_compatible_weak(const GaugeField &a, const GaugeField &b);
 
+    /**
+     * @brief Helper function that returns the default tolerance used by SU(3) projection
+     * @return The default tolerance, which is ~10x epsilon
+     */
+    double toleranceSU3() const
+    {
+      switch (precision) {
+      case QUDA_DOUBLE_PRECISION: return 2e-15;
+      case QUDA_SINGLE_PRECISION: return 1e-6;
+      default: return 1e-6;
+      }
+    }
+
     friend struct GaugeFieldParam;
   };
 
diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh
index 60b3b6a1a0..f8e00d03db 100644
--- a/include/kernels/gauge_ape.cuh
+++ b/include/kernels/gauge_ape.cuh
@@ -8,9 +8,6 @@
 namespace quda
 {
 
-#define DOUBLE_TOL 2e-15
-#define SINGLE_TOL 1e-6
-
   template <typename Float_, int nColor_, QudaReconstructType recon_, int apeDim_>
   struct GaugeAPEArg : kernel_param<> {
     using Float = Float_;
@@ -35,7 +32,7 @@ namespace quda
       in(in),
       alpha(alpha),
       dir_ignore(dir_ignore),
-      tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL)
+      tolerance(in.toleranceSU3())
     {
       for (int dir = 0; dir < 4; ++dir) {
         border[dir] = in.R()[dir];
diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 6988908776..a2a74863f6 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -8,9 +8,6 @@
 namespace quda
 {
 
-#define DOUBLE_TOL 2e-15
-#define SINGLE_TOL 1e-6
-
   template <typename Float_, int nColor_, QudaReconstructType recon_, int level_, int hypDim_>
   struct GaugeHYPArg : kernel_param<> {
     using Float = Float_;
@@ -38,7 +35,7 @@ namespace quda
       in(in),
       alpha(alpha),
       dir_ignore(dir_ignore),
-      tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL)
+      tolerance(in.toleranceSU3())
     {
       for (int dir = 0; dir < 4; ++dir) {
         border[dir] = in.R()[dir];
diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp
index 5994cf155f..37f8061686 100644
--- a/lib/gauge_observable.cpp
+++ b/lib/gauge_observable.cpp
@@ -12,7 +12,7 @@ namespace quda
       int *num_failures_h = static_cast<int *>(pool_pinned_malloc(sizeof(int)));
       int *num_failures_d = static_cast<int *>(get_mapped_device_pointer(num_failures_h));
       *num_failures_h = 0;
-      auto tol = u.Precision() == QUDA_DOUBLE_PRECISION ? 2e-15 : 1e-6;
+      auto tol = u.toleranceSU3();
       projectSU3(u, tol, num_failures_d);
       if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
       pool_pinned_free(num_failures_h);
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index bb144ba4df..a01c75998a 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3746,7 +3746,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
     // project onto SU(3) if using the Chroma convention
     if (param->staggered_phase_type == QUDA_STAGGERED_PHASE_CHROMA) {
       *num_failures_h = 0;
-      const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 2e-15 : 1e-6;
+      const double tol = unitarizedLink.toleranceSU3();
       if (unitarizedLink.StaggeredPhaseApplied()) unitarizedLink.removeStaggeredPhase();
       projectSU3(unitarizedLink, tol, num_failures_d);
       if (!unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied)

From 1133b3fccbec17194c52674b85086181b97ff85e Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 31 Jan 2024 21:06:18 -0800
Subject: [PATCH 08/22] Moved allocating temporaries into the HYPSmear routine

---
 include/gauge_tools.h         |  3 +--
 include/kernels/gauge_hyp.cuh |  2 +-
 lib/gauge_hyp.cu              | 27 ++++++++++++++++++++++++---
 lib/interface_quda.cpp        | 20 +-------------------
 4 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/include/gauge_tools.h b/include/gauge_tools.h
index cfd3f1e8ba..4c782d0ddc 100644
--- a/include/gauge_tools.h
+++ b/include/gauge_tools.h
@@ -123,14 +123,13 @@ namespace quda
   /**
      @brief Apply HYP smearing to the gauge field
      @param[out] dataDs Output smeared field
-     @param[in] dataTemp Temp space
      @param[in] dataOr Input gauge field
      @param[in] alpha1 smearing parameter
      @param[in] alpha2 smearing parameter
      @param[in] alpha3 smearing parameter
      @param[in] dir_ignore ignored direction
   */
-  void HYPStep(GaugeField &dataDs, GaugeField *dataTemp[4], GaugeField &dataOr, double alpha1, double alpha2,
+  void HYPStep(GaugeField &dataDs, GaugeField &dataOr, double alpha1, double alpha2,
                double alpha3, int dir_ignore);
 
   /**
diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index a2a74863f6..ec7e09e05b 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -28,7 +28,7 @@ namespace quda
     const int dir_ignore;
     const Float tolerance;
 
-    GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) :
+    GaugeHYPArg(GaugeField &out, GaugeField* tmp[4], const GaugeField &in, double alpha, int dir_ignore) :
       kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)),
       out(out),
       tmp {*tmp[0], *tmp[1], *tmp[2], *tmp[3]},
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index 331d1cfbf8..2090d19dd5 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -10,7 +10,7 @@ namespace quda
   template <typename Float, int nColor, QudaReconstructType recon> class GaugeHYP : TunableKernel3D
   {
     GaugeField &out;
-    GaugeField *tmp[4];
+    GaugeField* tmp[4];
     const GaugeField &in;
     const Float alpha;
     const int level;
@@ -21,7 +21,7 @@ namespace quda
 
   public:
     // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
-    GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) :
+    GaugeHYP(GaugeField &out, GaugeField* tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) :
       TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
       tmp {tmp[0], tmp[1], tmp[2], tmp[3]},
@@ -98,13 +98,31 @@ namespace quda
 
   }; // GaugeAPE
 
-  void HYPStep(GaugeField &out, GaugeField *tmp[4], GaugeField &in, double alpha1, double alpha2, double alpha3,
+  void HYPStep(GaugeField &out, GaugeField &in, double alpha1, double alpha2, double alpha3,
                int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
     checkNative(out, in);
 
+    GaugeFieldParam gParam(out);
+    gParam.location = QUDA_CUDA_FIELD_LOCATION;
+    const int smearDim = (dir_ignore >= 0 && dir_ignore <= 3) ? 3 : 4;
+    //GaugeField tmp[4];
+    gParam.geometry = QUDA_TENSOR_GEOMETRY;
+    GaugeFieldParam gParam2(gParam);
+    gParam2.create = QUDA_REFERENCE_FIELD_CREATE;
+
+    GaugeField* tmp[4];
+    if (smearDim == 3) {
+      tmp[0] = new GaugeField(gParam);
+      // aux[1], aux[2] and aux[3] will not be used for smearDim == 3
+      gParam.create = QUDA_REFERENCE_FIELD_CREATE;
+      for (int i = 1; i < 4; ++i) { tmp[i] = new GaugeField(gParam); }
+    } else {
+      for (int i = 0; i < 4; ++i) { tmp[i] = new GaugeField(gParam); }
+    }
+
     if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
 
     if (dir_ignore == 4) {
@@ -126,6 +144,9 @@ namespace quda
       instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
       out.exchangeExtendedGhost(out.R(), false);
     }
+
+    for (int i = 0; i < 4; i++) delete tmp[i];
+
   }
 
 } // namespace quda
\ No newline at end of file
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index db137a705e..756d7c089a 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -5082,20 +5082,6 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   GaugeField tmp(gParam);
 
-  const int smearDim = (smear_param->dir_ignore >= 0 && smear_param->dir_ignore <= 3) ? 3 : 4;
-  GaugeField *aux[4];
-  if (smear_param->smear_type == QUDA_GAUGE_SMEAR_HYP) {
-    gParam.geometry = QUDA_TENSOR_GEOMETRY;
-    if (smearDim == 3) {
-      aux[0] = new GaugeField(gParam);
-      // aux[1], aux[2] and aux[3] will not be used for smearDim=3
-      gParam.create = QUDA_REFERENCE_FIELD_CREATE;
-      for (int i = 1; i < 4; ++i) { aux[i] = new GaugeField(gParam); }
-    } else {
-      for (int i = 0; i < 4; ++i) { aux[i] = new GaugeField(gParam); }
-    }
-  }
-
   int measurement_n = 0; // The nth measurement to take
   gaugeObservablesQuda(&obs_param[measurement_n]);
   logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge);
@@ -5108,7 +5094,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
       OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore);
       break;
     case QUDA_GAUGE_SMEAR_HYP:
-      HYPStep(*gaugeSmeared, aux, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3,
+      HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3,
               smear_param->dir_ignore);
       break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
@@ -5121,10 +5107,6 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
     }
   }
 
-  if (smear_param->smear_type == QUDA_GAUGE_SMEAR_HYP) {
-    for (int i = 0; i < 4; ++i) { delete aux[i]; }
-  }
-
   popOutputPrefix();
 }
 

From 9545d28f83b272bf45d108f60fa025294cbf7dae Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 31 Jan 2024 21:37:58 -0800
Subject: [PATCH 09/22] Code cleanup, introducing extended field coordinate E

---
 include/kernels/gauge_hyp.cuh | 127 ++++++++++++++++------------------
 1 file changed, 61 insertions(+), 66 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index ec7e09e05b..8e116dcb65 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -22,6 +22,7 @@ namespace quda
     Gauge tmp[4];
     const Gauge in;
 
+    int E[4]; // extended grid dimensions
     int X[4]; // grid dimensions
     int border[4];
     const Float alpha;
@@ -39,13 +40,14 @@ namespace quda
     {
       for (int dir = 0; dir < 4; ++dir) {
         border[dir] = in.R()[dir];
+        E[dir] = in.X()[dir];
         X[dir] = in.X()[dir] - border[dir] * 2;
       }
     }
   };
 
-  template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const Int *X, const int parity,
+  template <typename Arg, typename Staple>
+  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const int parity,
                                                       const int mu, Staple staple[3])
   {
     using Link = typename get_type<Staple>::type;
@@ -64,16 +66,16 @@ namespace quda
 
       {
         // Get link U_{\nu}(x)
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, X), parity);
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
 
         // Get link U_{\mu}(x+\nu)
         dx[nu]++;
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
         dx[nu]--;
 
         // Get link U_{\nu}(x+\mu)
         dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
         dx[mu]--;
 
         // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
@@ -83,13 +85,13 @@ namespace quda
       {
         // Get link U_{\nu}(x-\nu)
         dx[nu]--;
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
         // Get link U_{\mu}(x-\nu)
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
 
         // Get link U_{\nu}(x-\nu+\mu)
         dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, X), parity);
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
 
         // reset dx
         dx[mu]--;
@@ -101,8 +103,8 @@ namespace quda
     }
   }
 
-  template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const Int *X, const int parity,
+  template <typename Arg, typename Staple>
+  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const int parity,
                                                       const int mu, Staple staple[3])
   {
     using Link = typename get_type<Staple>::type;
@@ -129,16 +131,16 @@ namespace quda
 
         {
           // Get link U_{\rho}(x)
-          Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), parity);
+          Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity);
 
           // Get link U_{\mu}(x+\rho)
           dx[rho]++;
-          Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+          Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
           dx[rho]--;
 
           // Get link U_{\rho}(x+\mu)
           dx[mu]++;
-          Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), 1 - parity);
+          Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), 1 - parity);
           dx[mu]--;
 
           // staple += U_{\rho}(x) * U_{\mu}(x+\rho) * U^\dag_{\rho}(x+\mu)
@@ -148,13 +150,13 @@ namespace quda
         {
           // Get link U_{\rho}(x-\rho)
           dx[rho]--;
-          Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), 1 - parity);
+          Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), 1 - parity);
           // Get link U_{\mu}(x-\rho)
-          Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+          Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
 
           // Get link U_{\rho}(x-\rho+\mu)
           dx[mu]++;
-          Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), parity);
+          Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity);
 
           // reset dx
           dx[mu]--;
@@ -167,8 +169,8 @@ namespace quda
     }
   }
 
-  template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity,
+  template <typename Arg, typename Staple>
+  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const int parity,
                                                       const int mu, Staple staple[3])
   {
     using Link = typename get_type<Staple>::type;
@@ -187,16 +189,16 @@ namespace quda
 
       {
         // Get link U_{\nu}(x)
-        Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), parity);
+        Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), parity);
 
         // Get link U_{\mu}(x+\nu)
         dx[nu]++;
-        Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
         dx[nu]--;
 
         // Get link U_{\nu}(x+\mu)
         dx[mu]++;
-        Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity);
         dx[mu]--;
 
         // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
@@ -206,13 +208,13 @@ namespace quda
       {
         // Get link U_{\nu}(x-\nu)
         dx[nu]--;
-        Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity);
         // Get link U_{\mu}(x-\nu)
-        Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
 
         // Get link U_{\nu}(x-\nu+\mu)
         dx[mu]++;
-        Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), parity);
+        Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), parity);
 
         // reset dx
         dx[mu]--;
@@ -235,50 +237,46 @@ namespace quda
       typedef Matrix<complex<real>, Arg::nColor> Link;
 
       // compute spacetime and local coords
-      int X[4];
-      for (int dr = 0; dr < 4; ++dr) X[dr] = arg.X[dr];
       int x[4];
-      getCoords(x, x_cb, X, parity);
-      for (int dr = 0; dr < 4; ++dr) {
-        x[dr] += arg.border[dr];
-        X[dr] += 2 * arg.border[dr];
-      }
+      getCoords(x, x_cb, arg.E, parity);
+#pragma unroll
+      for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
 
       int dx[4] = {0, 0, 0, 0};
       Link U, Stap[3], TestU, I;
 
       // Get link U
-      U = arg.in(dir, linkIndexShift(x, dx, X), parity);
+      U = arg.in(dir, linkIndexShift(x, dx, arg.X), parity);
       setIdentity(&I);
 
       if constexpr (Arg::level == 1) {
-        computeStapleLevel1(arg, x, X, parity, dir, Stap);
+        computeStapleLevel1(arg, x, parity, dir, Stap);
 
         for (int i = 0; i < 3; ++i) {
           TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.));
           polarSu3<real>(TestU, arg.tolerance);
-          arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U;
+          arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 2) {
-        computeStapleLevel2(arg, x, X, parity, dir, Stap);
+        computeStapleLevel2(arg, x, parity, dir, Stap);
 
         for (int i = 0; i < 3; ++i) {
           TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)4.));
           polarSu3<real>(TestU, arg.tolerance);
-          arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U;
+          arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 3) {
-        computeStapleLevel3(arg, x, X, parity, dir, Stap);
+        computeStapleLevel3(arg, x, parity, dir, Stap);
 
         TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.));
         polarSu3<real>(TestU, arg.tolerance);
-        arg.out(dir, linkIndexShift(x, dx, X), parity) = TestU * U;
+        arg.out(dir, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
       }
     }
   };
 
-  template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const Int *X, const int parity,
+  template <typename Arg, typename Staple>
+  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const int parity,
                                                         const int mu, Staple staple[2], const int dir_ignore)
   {
     using Link = typename get_type<Staple>::type;
@@ -297,16 +295,16 @@ namespace quda
 
       {
         // Get link U_{\nu}(x)
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, X), parity);
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
 
         // Get link U_{\mu}(x+\nu)
         dx[nu]++;
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
         dx[nu]--;
 
         // Get link U_{\nu}(x+\mu)
         dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
         dx[mu]--;
 
         // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
@@ -316,13 +314,13 @@ namespace quda
       {
         // Get link U_{\nu}(x-\nu)
         dx[nu]--;
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
         // Get link U_{\mu}(x-\nu)
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity);
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
 
         // Get link U_{\nu}(x-\nu+\mu)
         dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, X), parity);
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
 
         // reset dx
         dx[mu]--;
@@ -334,8 +332,8 @@ namespace quda
     }
   }
 
-  template <typename Arg, typename Staple, typename Int>
-  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity,
+  template <typename Arg, typename Staple>
+  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const int parity,
                                                         const int mu, Staple staple[2], int dir_ignore)
   {
     using Link = typename get_type<Staple>::type;
@@ -357,16 +355,16 @@ namespace quda
 
         {
           // Get link U_{\nu}(x)
-          Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), parity);
+          Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), parity);
 
           // Get link U_{\mu}(x+\nu)
           dx[nu]++;
-          Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+          Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
           dx[nu]--;
 
           // Get link U_{\nu}(x+\mu)
           dx[mu]++;
-          Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), 1 - parity);
+          Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity);
           dx[mu]--;
 
           // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
@@ -376,13 +374,13 @@ namespace quda
         {
           // Get link U_{\nu}(x-\nu)
           dx[nu]--;
-          Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), 1 - parity);
+          Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity);
           // Get link U_{\mu}(x-\nu)
-          Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, X), 1 - parity);
+          Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
 
           // Get link U_{\nu}(x-\nu+\mu)
           dx[mu]++;
-          Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), parity);
+          Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), parity);
 
           // reset dx
           dx[mu]--;
@@ -406,14 +404,11 @@ namespace quda
       typedef Matrix<complex<real>, Arg::nColor> Link;
 
       // compute spacetime and local coords
-      int X[4];
-      for (int dr = 0; dr < 4; ++dr) X[dr] = arg.X[dr];
       int x[4];
-      getCoords(x, x_cb, X, parity);
-      for (int dr = 0; dr < 4; ++dr) {
-        x[dr] += arg.border[dr];
-        X[dr] += 2 * arg.border[dr];
-      }
+      getCoords(x, x_cb, arg.X, parity);
+#pragma unroll
+      for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
+
       int dir_ = dir;
       dir = dir + (dir >= arg.dir_ignore);
 
@@ -421,23 +416,23 @@ namespace quda
       Link U, Stap[2], TestU, I;
 
       // Get link U
-      U = arg.in(dir, linkIndexShift(x, dx, X), parity);
+      U = arg.in(dir, linkIndexShift(x, dx, arg.E), parity);
       setIdentity(&I);
 
       if constexpr (Arg::level == 1) {
-        computeStaple3DLevel1(arg, x, X, parity, dir, Stap, arg.dir_ignore);
+        computeStaple3DLevel1(arg, x, parity, dir, Stap, arg.dir_ignore);
 
         for (int i = 0; i < 2; ++i) {
           TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.));
           polarSu3<real>(TestU, arg.tolerance);
-          arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, X), parity) = TestU * U;
+          arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 2) {
-        computeStaple3DLevel2(arg, x, X, parity, dir, Stap, arg.dir_ignore);
+        computeStaple3DLevel2(arg, x, parity, dir, Stap, arg.dir_ignore);
 
         TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.));
         polarSu3<real>(TestU, arg.tolerance);
-        arg.out(dir, linkIndexShift(x, dx, X), parity) = TestU * U;
+        arg.out(dir, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
       }
     }
   };

From 6685b14cc5b34aeaac88bc5311d3c75ff193bdc3 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Wed, 31 Jan 2024 22:08:19 -0800
Subject: [PATCH 10/22] Partial progress towards HYP cleanup

---
 include/kernels/gauge_hyp.cuh | 143 +++++++++++++++++++++++-----------
 1 file changed, 97 insertions(+), 46 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 8e116dcb65..3ff340d363 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -20,7 +20,7 @@ namespace quda
 
     Gauge out;
     Gauge tmp[4];
-    const Gauge in;
+    Gauge in;
 
     int E[4]; // extended grid dimensions
     int X[4]; // grid dimensions
@@ -46,11 +46,98 @@ namespace quda
     }
   };
 
-  template <typename Arg, typename Staple>
-  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const int parity,
-                                                      const int mu, Staple staple[3])
+  template <int mu, int nu, typename Arg>
+  __host__ __device__ inline void computeStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity)
   {
-    using Link = typename get_type<Staple>::type;
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
+    int dx[4] = {0, 0, 0, 0};
+
+    /* Computes the upper staple :
+     *                 mu (B)
+     *               +-------+
+     *       nu	   |	   |
+     *	     (A)   |	   |(C)
+     *		   X	   X
+     */
+    {
+      /* load matrix A*/
+      Link a = gauge_nu(nu, linkIndex(x, arg.E), parity);
+
+      /* load matrix B*/
+      dx[nu]++;
+      Link b = gauge_mu(mu, linkIndexShift(x, dx, arg.E), 1-parity);
+      dx[nu]--;
+
+      /* load matrix C*/
+      dx[mu]++;
+      Link c = gauge_nu(nu, linkIndexShift(x, dx, arg.E), 1-parity);
+      dx[mu]--;
+
+      staple = a * b * conj(c);
+    }
+
+    /* Computes the lower staple :
+     *                 X       X
+     *           nu    |       |
+     *	         (A)   |       | (C)
+     *		       +-------+
+     *                  mu (B)
+     */
+    {
+      /* load matrix A*/
+      dx[nu]--;
+      Link a = gauge_nu(nu, linkIndexShift(x, dx, arg.E), 1-parity);
+
+      /* load matrix B*/
+      Link b = gauge_mu(mu, linkIndexShift(x, dx, arg.E), 1-parity);
+
+      /* load matrix C*/
+      dx[mu]++;
+      Link c = gauge_nu(nu, linkIndexShift(x, dx, arg.E), parity);
+      dx[mu]--;
+      dx[nu]++;
+
+      staple = staple + conj(a)*b*c;
+    }
+  }
+
+  template <typename Arg>
+  __host__ __device__ inline void computeStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg,
+          const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity, int mu, int nu)
+  {
+    switch (mu) {
+      case 0:
+        switch (nu) {
+        case 1: computeStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 2: computeStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 3: computeStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        } break;
+      case 1:
+        switch (nu) {
+        case 0: computeStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 2: computeStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 3: computeStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        } break;
+      case 2:
+        switch (nu) {
+        case 0: computeStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 1: computeStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 3: computeStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+      } break;
+      case 3:
+        switch (nu) {
+        case 0: computeStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 1: computeStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 2: computeStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        } break;
+      }
+  }
+
+  template <typename Arg>
+  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], int parity,
+                                                      int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
+  {
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     for (int i = 0; i < 3; ++i) staple[i] = Link();
 
     thread_array<int, 4> dx = {};
@@ -64,42 +151,7 @@ namespace quda
 
       cnt += 1;
 
-      {
-        // Get link U_{\nu}(x)
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
-
-        // Get link U_{\mu}(x+\nu)
-        dx[nu]++;
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        dx[nu]--;
-
-        // Get link U_{\nu}(x+\mu)
-        dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        dx[mu]--;
-
-        // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
-        staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
-      }
-
-      {
-        // Get link U_{\nu}(x-\nu)
-        dx[nu]--;
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        // Get link U_{\mu}(x-\nu)
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-
-        // Get link U_{\nu}(x-\nu+\mu)
-        dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
-
-        // reset dx
-        dx[mu]--;
-        dx[nu]++;
-
-        // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
-        staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
-      }
+      computeStaple(staple[cnt], arg, arg.in, arg.in, x, parity, mu, nu);
     }
   }
 
@@ -169,12 +221,11 @@ namespace quda
     }
   }
 
-  template <typename Arg, typename Staple>
+  template <typename Arg>
   __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const int parity,
-                                                      const int mu, Staple staple[3])
+                                                      const int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
   {
-    using Link = typename get_type<Staple>::type;
-    staple[0] = Link();
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
 
     thread_array<int, 4> dx = {};
 #pragma unroll
@@ -234,7 +285,7 @@ namespace quda
     __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
     {
       using real = typename Arg::Float;
-      typedef Matrix<complex<real>, Arg::nColor> Link;
+      using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
 
       // compute spacetime and local coords
       int x[4];

From 2640ae923a477c117de464e95d11ee81687ca40c Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 1 Feb 2024 08:43:36 -0800
Subject: [PATCH 11/22] Cleanup/bugfixes

---
 include/kernels/gauge_hyp.cuh | 66 +++++++++++++++++++++++++++--------
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 3ff340d363..fd948e57c7 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -3,6 +3,7 @@
 #include <quda_matrix.h>
 #include <su3_project.cuh>
 #include <kernel.h>
+#include <fast_intdiv.h>
 #include <kernels/gauge_utils.cuh>
 
 namespace quda
@@ -22,8 +23,8 @@ namespace quda
     Gauge tmp[4];
     Gauge in;
 
-    int E[4]; // extended grid dimensions
-    int X[4]; // grid dimensions
+    int_fastdiv E[4]; // extended grid dimensions
+    int_fastdiv X[4]; // grid dimensions
     int border[4];
     const Float alpha;
     const int dir_ignore;
@@ -55,9 +56,9 @@ namespace quda
     /* Computes the upper staple :
      *                 mu (B)
      *               +-------+
-     *       nu	   |	   |
-     *	     (A)   |	   |(C)
-     *		   X	   X
+     *         nu    |       |
+     *         (A)   |       |(C)
+     *               X       X
      */
     {
       /* load matrix A*/
@@ -79,9 +80,9 @@ namespace quda
     /* Computes the lower staple :
      *                 X       X
      *           nu    |       |
-     *	         (A)   |       | (C)
-     *		       +-------+
-     *                  mu (B)
+     *           (A)   |       | (C)
+     *                 +-------+
+     *                   mu (B)
      */
     {
       /* load matrix A*/
@@ -152,14 +153,49 @@ namespace quda
       cnt += 1;
 
       computeStaple(staple[cnt], arg, arg.in, arg.in, x, parity, mu, nu);
+
+      /*{
+        // Get link U_{\nu}(x)
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
+
+        // Get link U_{\mu}(x+\nu)
+        dx[nu]++;
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
+        dx[nu]--;
+
+        // Get link U_{\nu}(x+\mu)
+        dx[mu]++;
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
+        dx[mu]--;
+
+        // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
+        staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
+      }
+      {
+        // Get link U_{\nu}(x-\nu)
+        dx[nu]--;
+        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
+        // Get link U_{\mu}(x-\nu)
+        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
+
+        // Get link U_{\nu}(x-\nu+\mu)
+        dx[mu]++;
+        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
+
+        // reset dx
+        dx[mu]--;
+        dx[nu]++;
+        // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
+        staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
+      }*/
     }
   }
 
-  template <typename Arg, typename Staple>
-  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const int parity,
-                                                      const int mu, Staple staple[3])
+  template <typename Arg>
+  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], int parity,
+                                                      int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
   {
-    using Link = typename get_type<Staple>::type;
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     for (int i = 0; i < 3; ++i) staple[i] = Link();
 
     thread_array<int, 4> dx = {};
@@ -181,6 +217,8 @@ namespace quda
         const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
         const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
 
+        //computeStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, parity, sigma_with_mu, sigma_with_rho);
+
         {
           // Get link U_{\rho}(x)
           Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity);
@@ -289,7 +327,7 @@ namespace quda
 
       // compute spacetime and local coords
       int x[4];
-      getCoords(x, x_cb, arg.E, parity);
+      getCoords(x, x_cb, arg.X, parity);
 #pragma unroll
       for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
 
@@ -297,7 +335,7 @@ namespace quda
       Link U, Stap[3], TestU, I;
 
       // Get link U
-      U = arg.in(dir, linkIndexShift(x, dx, arg.X), parity);
+      U = arg.in(dir, linkIndexShift(x, dx, arg.E), parity);
       setIdentity(&I);
 
       if constexpr (Arg::level == 1) {

From 4f2a6254eade5fcedf428c543ddd32c3e381673c Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 1 Feb 2024 09:36:48 -0800
Subject: [PATCH 12/22] Cleaned up/corrected staple abstraction. Still
 generating a stack frame...

---
 include/kernels/gauge_hyp.cuh | 274 ++++++++--------------------------
 1 file changed, 64 insertions(+), 210 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index fd948e57c7..e226a29e2f 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -21,7 +21,7 @@ namespace quda
 
     Gauge out;
     Gauge tmp[4];
-    Gauge in;
+    const Gauge in;
 
     int_fastdiv E[4]; // extended grid dimensions
     int_fastdiv X[4]; // grid dimensions
@@ -47,8 +47,22 @@ namespace quda
     }
   };
 
+  /**
+     @brief Calculates a staple as part of the HYP calculation, returning the product
+
+     @param[in,out] staple The accumulated staple
+     @param[in] arg Kernel argument
+     @param[in] gauge_mu Gauge/tensor field used for the parallel direction
+     @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction
+     @param[in] x Full index array
+     @param[in] parity Parity index
+     @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields
+     @tparam The {parallel, perpendicular} pair of directions for coordinate offsets
+  */
   template <int mu, int nu, typename Arg>
-  __host__ __device__ inline void computeStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity)
+  __host__ __device__ inline void accumulateStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg,
+    const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu,
+    int x[], int parity, int2 tensor_arg)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     int dx[4] = {0, 0, 0, 0};
@@ -62,19 +76,19 @@ namespace quda
      */
     {
       /* load matrix A*/
-      Link a = gauge_nu(nu, linkIndex(x, arg.E), parity);
+      Link a = gauge_nu(tensor_arg.y, linkIndex(x, arg.E), parity);
 
       /* load matrix B*/
       dx[nu]++;
-      Link b = gauge_mu(mu, linkIndexShift(x, dx, arg.E), 1-parity);
+      Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1-parity);
       dx[nu]--;
 
       /* load matrix C*/
       dx[mu]++;
-      Link c = gauge_nu(nu, linkIndexShift(x, dx, arg.E), 1-parity);
+      Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity);
       dx[mu]--;
 
-      staple = a * b * conj(c);
+      staple = staple + a * b * conj(c);
     }
 
     /* Computes the lower staple :
@@ -87,14 +101,14 @@ namespace quda
     {
       /* load matrix A*/
       dx[nu]--;
-      Link a = gauge_nu(nu, linkIndexShift(x, dx, arg.E), 1-parity);
+      Link a = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity);
 
       /* load matrix B*/
-      Link b = gauge_mu(mu, linkIndexShift(x, dx, arg.E), 1-parity);
+      Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1-parity);
 
       /* load matrix C*/
       dx[mu]++;
-      Link c = gauge_nu(nu, linkIndexShift(x, dx, arg.E), parity);
+      Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), parity);
       dx[mu]--;
       dx[nu]++;
 
@@ -102,34 +116,50 @@ namespace quda
     }
   }
 
+  /**
+     @brief Calculates a staple as part of the HYP calculation, returning the product
+
+     @param[in,out] staple The accumulated staple
+     @param[in] arg Kernel argument
+     @param[in] gauge_mu Gauge/tensor field used for the parallel direction
+     @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction
+     @param[in] x Full index array
+     @param[in] parity Parity index
+     @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields
+     @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets
+  */
   template <typename Arg>
-  __host__ __device__ inline void computeStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg,
-          const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity, int mu, int nu)
+  __host__ __device__ inline void accumulateStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg,
+          const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity, int2 tensor_arg, int2 shifts)
   {
+    // for readability
+    const int mu = shifts.x;
+    const int nu = shifts.y;
+
     switch (mu) {
       case 0:
         switch (nu) {
-        case 1: computeStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
-        case 2: computeStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
-        case 3: computeStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 1: accumulateStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 2: accumulateStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 3: accumulateStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
         } break;
       case 1:
         switch (nu) {
-        case 0: computeStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
-        case 2: computeStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
-        case 3: computeStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 0: accumulateStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 2: accumulateStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 3: accumulateStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
         } break;
       case 2:
         switch (nu) {
-        case 0: computeStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
-        case 1: computeStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
-        case 3: computeStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 0: accumulateStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 1: accumulateStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 3: accumulateStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
       } break;
       case 3:
         switch (nu) {
-        case 0: computeStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
-        case 1: computeStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
-        case 2: computeStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break;
+        case 0: accumulateStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 1: accumulateStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 2: accumulateStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
         } break;
       }
   }
@@ -152,42 +182,8 @@ namespace quda
 
       cnt += 1;
 
-      computeStaple(staple[cnt], arg, arg.in, arg.in, x, parity, mu, nu);
-
-      /*{
-        // Get link U_{\nu}(x)
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
-
-        // Get link U_{\mu}(x+\nu)
-        dx[nu]++;
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        dx[nu]--;
+      accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, parity, { mu, nu }, { mu, nu });
 
-        // Get link U_{\nu}(x+\mu)
-        dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        dx[mu]--;
-
-        // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
-        staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
-      }
-      {
-        // Get link U_{\nu}(x-\nu)
-        dx[nu]--;
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        // Get link U_{\mu}(x-\nu)
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-
-        // Get link U_{\nu}(x-\nu+\mu)
-        dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
-
-        // reset dx
-        dx[mu]--;
-        dx[nu]++;
-        // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
-        staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
-      }*/
     }
   }
 
@@ -217,51 +213,14 @@ namespace quda
         const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
         const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
 
-        //computeStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, parity, sigma_with_mu, sigma_with_rho);
-
-        {
-          // Get link U_{\rho}(x)
-          Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity);
-
-          // Get link U_{\mu}(x+\rho)
-          dx[rho]++;
-          Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-          dx[rho]--;
-
-          // Get link U_{\rho}(x+\mu)
-          dx[mu]++;
-          Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), 1 - parity);
-          dx[mu]--;
-
-          // staple += U_{\rho}(x) * U_{\mu}(x+\rho) * U^\dag_{\rho}(x+\mu)
-          staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
-        }
-
-        {
-          // Get link U_{\rho}(x-\rho)
-          dx[rho]--;
-          Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), 1 - parity);
-          // Get link U_{\mu}(x-\rho)
-          Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-
-          // Get link U_{\rho}(x-\rho+\mu)
-          dx[mu]++;
-          Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity);
-
-          // reset dx
-          dx[mu]--;
-          dx[rho]++;
-
-          // staple += U^\dag_{\rho}(x-\rho) * U_{\mu}(x-\rho) * U_{\rho}(x-\rho+\mu)
-          staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
-        }
+        accumulateStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho});
       }
     }
   }
 
   template <typename Arg>
-  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const int parity,
-                                                      const int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
+  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], int parity,
+                                                      int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
 
@@ -276,42 +235,7 @@ namespace quda
       const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu);
       const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
 
-      {
-        // Get link U_{\nu}(x)
-        Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), parity);
-
-        // Get link U_{\mu}(x+\nu)
-        dx[nu]++;
-        Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        dx[nu]--;
-
-        // Get link U_{\nu}(x+\mu)
-        dx[mu]++;
-        Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        dx[mu]--;
-
-        // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
-        staple[0] = staple[0] + U1 * U2 * conj(U3);
-      }
-
-      {
-        // Get link U_{\nu}(x-\nu)
-        dx[nu]--;
-        Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        // Get link U_{\mu}(x-\nu)
-        Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-
-        // Get link U_{\nu}(x-\nu+\mu)
-        dx[mu]++;
-        Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), parity);
-
-        // reset dx
-        dx[mu]--;
-        dx[nu]++;
-
-        // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
-        staple[0] = staple[0] + conj(U1) * U2 * U3;
-      }
+      accumulateStaple(staple[0], arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, parity, {nu_with_mu, mu_with_nu}, {mu, nu});
     }
   }
 
@@ -365,8 +289,8 @@ namespace quda
   };
 
   template <typename Arg, typename Staple>
-  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const int parity,
-                                                        const int mu, Staple staple[2], const int dir_ignore)
+  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], int parity,
+                                                        int mu, Staple staple[2], const int dir_ignore)
   {
     using Link = typename get_type<Staple>::type;
     for (int i = 0; i < 2; ++i) staple[i] = Link();
@@ -382,48 +306,13 @@ namespace quda
 
       cnt += 1;
 
-      {
-        // Get link U_{\nu}(x)
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
-
-        // Get link U_{\mu}(x+\nu)
-        dx[nu]++;
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        dx[nu]--;
-
-        // Get link U_{\nu}(x+\mu)
-        dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        dx[mu]--;
-
-        // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
-        staple[cnt] = staple[cnt] + U1 * U2 * conj(U3);
-      }
-
-      {
-        // Get link U_{\nu}(x-\nu)
-        dx[nu]--;
-        Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-        // Get link U_{\mu}(x-\nu)
-        Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-
-        // Get link U_{\nu}(x-\nu+\mu)
-        dx[mu]++;
-        Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity);
-
-        // reset dx
-        dx[mu]--;
-        dx[nu]++;
-
-        // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
-        staple[cnt] = staple[cnt] + conj(U1) * U2 * U3;
-      }
+      accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, parity, {mu, nu}, {mu, nu});
     }
   }
 
   template <typename Arg, typename Staple>
-  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const int parity,
-                                                        const int mu, Staple staple[2], int dir_ignore)
+  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], int parity,
+                                                        int mu, Staple staple[2], int dir_ignore)
   {
     using Link = typename get_type<Staple>::type;
     staple[0] = Link();
@@ -442,42 +331,7 @@ namespace quda
         const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore);
         const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore);
 
-        {
-          // Get link U_{\nu}(x)
-          Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), parity);
-
-          // Get link U_{\mu}(x+\nu)
-          dx[nu]++;
-          Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-          dx[nu]--;
-
-          // Get link U_{\nu}(x+\mu)
-          dx[mu]++;
-          Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-          dx[mu]--;
-
-          // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu)
-          staple[0] = staple[0] + U1 * U2 * conj(U3);
-        }
-
-        {
-          // Get link U_{\nu}(x-\nu)
-          dx[nu]--;
-          Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity);
-          // Get link U_{\mu}(x-\nu)
-          Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity);
-
-          // Get link U_{\nu}(x-\nu+\mu)
-          dx[mu]++;
-          Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), parity);
-
-          // reset dx
-          dx[mu]--;
-          dx[nu]++;
-
-          // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu)
-          staple[0] = staple[0] + conj(U1) * U2 * U3;
-        }
+        accumulateStaple(staple[0], arg, arg.tmp[0], arg.tmp[0], x, parity, {rho_with_mu, rho_with_nu}, {mu, nu});
       }
     }
   }

From 0fd5776c63d6d479711218c5f1e43874683f8999 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 1 Feb 2024 10:49:19 -0800
Subject: [PATCH 13/22] Cleanup before getting rid of templating over staples

---
 include/kernels/gauge_hyp.cuh | 114 +++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 57 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index e226a29e2f..c07f75eee3 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -55,17 +55,17 @@ namespace quda
      @param[in] gauge_mu Gauge/tensor field used for the parallel direction
      @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction
      @param[in] x Full index array
+     @param[in,out] dx Full index offset array; leaves the function with the same values it entered with
      @param[in] parity Parity index
      @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields
      @tparam The {parallel, perpendicular} pair of directions for coordinate offsets
   */
-  template <int mu, int nu, typename Arg>
+  template <int mu, int nu, typename Arg, typename Coord>
   __host__ __device__ inline void accumulateStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg,
     const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu,
-    int x[], int parity, int2 tensor_arg)
+    int x[], Coord &dx, int parity, int2 tensor_arg)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
-    int dx[4] = {0, 0, 0, 0};
 
     /* Computes the upper staple :
      *                 mu (B)
@@ -124,13 +124,15 @@ namespace quda
      @param[in] gauge_mu Gauge/tensor field used for the parallel direction
      @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction
      @param[in] x Full index array
+     @param[in,out] dx Full index offset array; leaves the function with the same values it entered with
      @param[in] parity Parity index
      @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields
      @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets
   */
-  template <typename Arg>
+  template <typename Arg, typename Coord>
   __host__ __device__ inline void accumulateStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg,
-          const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity, int2 tensor_arg, int2 shifts)
+          const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu,
+          int x[], Coord &dx, int parity, int2 tensor_arg, int2 shifts)
   {
     // for readability
     const int mu = shifts.x;
@@ -139,39 +141,38 @@ namespace quda
     switch (mu) {
       case 0:
         switch (nu) {
-        case 1: accumulateStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
-        case 2: accumulateStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
-        case 3: accumulateStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 1: accumulateStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
+        case 2: accumulateStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
+        case 3: accumulateStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
         } break;
       case 1:
         switch (nu) {
-        case 0: accumulateStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
-        case 2: accumulateStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
-        case 3: accumulateStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 0: accumulateStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
+        case 2: accumulateStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
+        case 3: accumulateStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
         } break;
       case 2:
         switch (nu) {
-        case 0: accumulateStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
-        case 1: accumulateStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
-        case 3: accumulateStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 0: accumulateStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
+        case 1: accumulateStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
+        case 3: accumulateStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
       } break;
       case 3:
         switch (nu) {
-        case 0: accumulateStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
-        case 1: accumulateStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
-        case 2: accumulateStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break;
+        case 0: accumulateStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
+        case 1: accumulateStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
+        case 2: accumulateStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
         } break;
       }
   }
 
-  template <typename Arg>
-  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], int parity,
+  template <typename Arg, typename Coord>
+  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], Coord &dx, int parity,
                                                       int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     for (int i = 0; i < 3; ++i) staple[i] = Link();
 
-    thread_array<int, 4> dx = {};
     int cnt = -1;
 #pragma unroll
     for (int nu = 0; nu < 4; ++nu) {
@@ -182,19 +183,18 @@ namespace quda
 
       cnt += 1;
 
-      accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, parity, { mu, nu }, { mu, nu });
+      accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, dx, parity, { mu, nu }, { mu, nu });
 
     }
   }
 
-  template <typename Arg>
-  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], int parity,
+  template <typename Arg, typename Coord>
+  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], Coord &dx, int parity,
                                                       int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     for (int i = 0; i < 3; ++i) staple[i] = Link();
 
-    thread_array<int, 4> dx = {};
     int cnt = -1;
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
@@ -213,18 +213,17 @@ namespace quda
         const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
         const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
 
-        accumulateStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho});
+        accumulateStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho});
       }
     }
   }
 
-  template <typename Arg>
-  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], int parity,
+  template <typename Arg, typename Coord>
+  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], Coord &dx, int parity,
                                                       int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
 
-    thread_array<int, 4> dx = {};
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and
@@ -235,7 +234,7 @@ namespace quda
       const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu);
       const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
 
-      accumulateStaple(staple[0], arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, parity, {nu_with_mu, mu_with_nu}, {mu, nu});
+      accumulateStaple(staple[0], arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, {mu, nu});
     }
   }
 
@@ -255,7 +254,8 @@ namespace quda
 #pragma unroll
       for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
 
-      int dx[4] = {0, 0, 0, 0};
+      thread_array<int, 4> dx = {0, 0, 0, 0};
+
       Link U, Stap[3], TestU, I;
 
       // Get link U
@@ -263,23 +263,25 @@ namespace quda
       setIdentity(&I);
 
       if constexpr (Arg::level == 1) {
-        computeStapleLevel1(arg, x, parity, dir, Stap);
+        computeStapleLevel1(arg, x, dx, parity, dir, Stap);
 
+#pragma unroll
         for (int i = 0; i < 3; ++i) {
           TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.));
           polarSu3<real>(TestU, arg.tolerance);
           arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 2) {
-        computeStapleLevel2(arg, x, parity, dir, Stap);
+        computeStapleLevel2(arg, x, dx, parity, dir, Stap);
 
+#pragma unroll
         for (int i = 0; i < 3; ++i) {
           TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)4.));
           polarSu3<real>(TestU, arg.tolerance);
           arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 3) {
-        computeStapleLevel3(arg, x, parity, dir, Stap);
+        computeStapleLevel3(arg, x, dx, parity, dir, Stap);
 
         TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.));
         polarSu3<real>(TestU, arg.tolerance);
@@ -288,36 +290,33 @@ namespace quda
     }
   };
 
-  template <typename Arg, typename Staple>
-  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], int parity,
-                                                        int mu, Staple staple[2], const int dir_ignore)
+  template <typename Arg, typename Coord>
+  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], Coord &dx, int parity,
+                                                        int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[2], const int dir_ignore)
   {
-    using Link = typename get_type<Staple>::type;
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     for (int i = 0; i < 2; ++i) staple[i] = Link();
 
-    thread_array<int, 4> dx = {};
-    int cnt = -1;
+    int cnt = 0;
 #pragma unroll
     for (int nu = 0; nu < 4; ++nu) {
-      // Identify directions orthogonal to the link and
-      // ignore the dir_ignore direction (usually the temporal dim
-      // when used with STOUT or APE for measurement smearing)
-      if (nu == mu || nu == dir_ignore) continue;
-
-      cnt += 1;
-
-      accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, parity, {mu, nu}, {mu, nu});
+      if (nu != mu && nu != dir_ignore) {
+        // Identify directions orthogonal to the link and
+        // ignore the dir_ignore direction (usually the temporal dim
+        // when used with STOUT or APE for measurement smearing)
+        accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu});
+        cnt++;
+      }
     }
   }
 
-  template <typename Arg, typename Staple>
-  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], int parity,
-                                                        int mu, Staple staple[2], int dir_ignore)
+  template <typename Arg, typename Coord>
+  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], Coord &dx, int parity,
+                                                        int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[2], int dir_ignore)
   {
-    using Link = typename get_type<Staple>::type;
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     staple[0] = Link();
 
-    thread_array<int, 4> dx = {};
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and
@@ -331,7 +330,7 @@ namespace quda
         const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore);
         const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore);
 
-        accumulateStaple(staple[0], arg, arg.tmp[0], arg.tmp[0], x, parity, {rho_with_mu, rho_with_nu}, {mu, nu});
+        accumulateStaple(staple[0], arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu});
       }
     }
   }
@@ -352,10 +351,10 @@ namespace quda
 #pragma unroll
       for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
 
+      thread_array<int, 4> dx = {0, 0, 0, 0};
+
       int dir_ = dir;
       dir = dir + (dir >= arg.dir_ignore);
-
-      int dx[4] = {0, 0, 0, 0};
       Link U, Stap[2], TestU, I;
 
       // Get link U
@@ -363,15 +362,16 @@ namespace quda
       setIdentity(&I);
 
       if constexpr (Arg::level == 1) {
-        computeStaple3DLevel1(arg, x, parity, dir, Stap, arg.dir_ignore);
+        computeStaple3DLevel1(arg, x, dx, parity, dir, Stap, arg.dir_ignore);
 
+#pragma unroll
         for (int i = 0; i < 2; ++i) {
           TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.));
           polarSu3<real>(TestU, arg.tolerance);
           arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 2) {
-        computeStaple3DLevel2(arg, x, parity, dir, Stap, arg.dir_ignore);
+        computeStaple3DLevel2(arg, x, dx, parity, dir, Stap, arg.dir_ignore);
 
         TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.));
         polarSu3<real>(TestU, arg.tolerance);

From 5f15d84b41c7b5c910039d5fd938f567e9d1bd82 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 1 Feb 2024 11:54:04 -0800
Subject: [PATCH 14/22] Removed as much stack frame as possible, remaining
 frame is due to register count getting maxed out

---
 include/kernels/gauge_hyp.cuh | 187 ++++++++++++++--------------------
 1 file changed, 79 insertions(+), 108 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index c07f75eee3..9c18cabd59 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -50,7 +50,6 @@ namespace quda
   /**
      @brief Calculates a staple as part of the HYP calculation, returning the product
 
-     @param[in,out] staple The accumulated staple
      @param[in] arg Kernel argument
      @param[in] gauge_mu Gauge/tensor field used for the parallel direction
      @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction
@@ -58,14 +57,20 @@ namespace quda
      @param[in,out] dx Full index offset array; leaves the function with the same values it entered with
      @param[in] parity Parity index
      @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields
-     @tparam The {parallel, perpendicular} pair of directions for coordinate offsets
+     @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets
+     @return The computed staple
   */
-  template <int mu, int nu, typename Arg, typename Coord>
-  __host__ __device__ inline void accumulateStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg,
-    const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu,
-    int x[], Coord &dx, int parity, int2 tensor_arg)
+  template <typename Arg>
+  __host__ __device__ inline Matrix<complex<typename Arg::Float>, Arg::nColor> accumulateStaple(const Arg &arg,
+          const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu,
+          int x[], thread_array<int, 4> &dx, int parity, int2 tensor_arg, int2 shifts)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
+    Link staple;
+
+    // for readability
+    const int mu = shifts.x;
+    const int nu = shifts.y;
 
     /* Computes the upper staple :
      *                 mu (B)
@@ -88,7 +93,7 @@ namespace quda
       Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity);
       dx[mu]--;
 
-      staple = staple + a * b * conj(c);
+      staple = a * b * conj(c);
     }
 
     /* Computes the lower staple :
@@ -114,113 +119,74 @@ namespace quda
 
       staple = staple + conj(a)*b*c;
     }
-  }
-
-  /**
-     @brief Calculates a staple as part of the HYP calculation, returning the product
-
-     @param[in,out] staple The accumulated staple
-     @param[in] arg Kernel argument
-     @param[in] gauge_mu Gauge/tensor field used for the parallel direction
-     @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction
-     @param[in] x Full index array
-     @param[in,out] dx Full index offset array; leaves the function with the same values it entered with
-     @param[in] parity Parity index
-     @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields
-     @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets
-  */
-  template <typename Arg, typename Coord>
-  __host__ __device__ inline void accumulateStaple(Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, const Arg &arg,
-          const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu,
-          int x[], Coord &dx, int parity, int2 tensor_arg, int2 shifts)
-  {
-    // for readability
-    const int mu = shifts.x;
-    const int nu = shifts.y;
 
-    switch (mu) {
-      case 0:
-        switch (nu) {
-        case 1: accumulateStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        case 2: accumulateStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        case 3: accumulateStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        } break;
-      case 1:
-        switch (nu) {
-        case 0: accumulateStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        case 2: accumulateStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        case 3: accumulateStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        } break;
-      case 2:
-        switch (nu) {
-        case 0: accumulateStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        case 1: accumulateStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        case 3: accumulateStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-      } break;
-      case 3:
-        switch (nu) {
-        case 0: accumulateStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        case 1: accumulateStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        case 2: accumulateStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break;
-        } break;
-      }
+    return staple;
   }
 
-  template <typename Arg, typename Coord>
-  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], Coord &dx, int parity,
+  template <typename Arg>
+  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
                                                       int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     for (int i = 0; i < 3; ++i) staple[i] = Link();
 
-    int cnt = -1;
+    int cnt = 0;
 #pragma unroll
     for (int nu = 0; nu < 4; ++nu) {
       // Identify directions orthogonal to the link and
       // ignore the dir_ignore direction (usually the temporal dim
       // when used with STOUT or APE for measurement smearing)
-      if (nu == mu) continue;
-
-      cnt += 1;
-
-      accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, dx, parity, { mu, nu }, { mu, nu });
-
+      if (nu != mu) {
+        Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, { mu, nu }, { mu, nu });
+        switch (cnt) {
+        case 0: staple[0] = staple[0] + accum; break;
+        case 1: staple[1] = staple[1] + accum; break;
+        case 2: staple[2] = staple[2] + accum; break;
+        }
+        cnt++;
+      }
     }
   }
 
-  template <typename Arg, typename Coord>
-  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], Coord &dx, int parity,
+  template <typename Arg>
+  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
                                                       int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     for (int i = 0; i < 3; ++i) staple[i] = Link();
 
-    int cnt = -1;
+    int cnt = 0;
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and
       // ignore the dir_ignore direction (usually the temporal dim
       // when used with STOUT or APE for measurement smearing)
-      if (nu == mu) continue;
-
-      cnt += 1;
-
-      for (int rho = 0; rho < 4; ++rho) {
-        if (rho == mu || rho == nu) continue;
-        int sigma = 0;
-        while (sigma == mu || sigma == nu || sigma == rho) sigma += 1;
-
-        const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
-        const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
+      if (nu != mu) {
+#pragma unroll
+        for (int rho = 0; rho < 4; ++rho) {
+          if (rho == mu || rho == nu) continue;
+          int sigma = 0;
+          while (sigma == mu || sigma == nu || sigma == rho) sigma += 1;
+
+          const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
+          const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
+
+          Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho});
+          switch (cnt) {
+          case 0: staple[0] = staple[0] + accum; break;
+          case 1: staple[1] = staple[1] + accum; break;
+          case 2: staple[2] = staple[2] + accum; break;
+          }
+        }
 
-        accumulateStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho});
+        cnt++;
       }
     }
   }
 
-  template <typename Arg, typename Coord>
-  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], Coord &dx, int parity,
-                                                      int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
+  template <typename Arg>
+  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
+                                                      int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> &staple)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
 
@@ -229,12 +195,12 @@ namespace quda
       // Identify directions orthogonal to the link and
       // ignore the dir_ignore direction (usually the temporal dim
       // when used with STOUT or APE for measurement smearing)
-      if (nu == mu) continue;
+      if (nu != mu) {
+        const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu);
+        const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
 
-      const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu);
-      const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
-
-      accumulateStaple(staple[0], arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, {mu, nu});
+        staple = staple + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, {mu, nu});
+      }
     }
   }
 
@@ -281,7 +247,7 @@ namespace quda
           arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 3) {
-        computeStapleLevel3(arg, x, dx, parity, dir, Stap);
+        computeStapleLevel3(arg, x, dx, parity, dir, Stap[0]);
 
         TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.));
         polarSu3<real>(TestU, arg.tolerance);
@@ -290,8 +256,8 @@ namespace quda
     }
   };
 
-  template <typename Arg, typename Coord>
-  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], Coord &dx, int parity,
+  template <typename Arg>
+  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
                                                         int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[2], const int dir_ignore)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
@@ -300,37 +266,42 @@ namespace quda
     int cnt = 0;
 #pragma unroll
     for (int nu = 0; nu < 4; ++nu) {
+      // Identify directions orthogonal to the link and
+      // ignore the dir_ignore direction (usually the temporal dim
+      // when used with STOUT or APE for measurement smearing)
       if (nu != mu && nu != dir_ignore) {
-        // Identify directions orthogonal to the link and
-        // ignore the dir_ignore direction (usually the temporal dim
-        // when used with STOUT or APE for measurement smearing)
-        accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu});
+        Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu});
+        switch (cnt) {
+        case 0: staple[0] = staple[0] + accum; break;
+        case 1: staple[1] = staple[1] + accum; break;
+        }
         cnt++;
       }
     }
   }
 
-  template <typename Arg, typename Coord>
-  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], Coord &dx, int parity,
-                                                        int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[2], int dir_ignore)
+  template <typename Arg>
+  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
+                                                        int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, int dir_ignore)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
-    staple[0] = Link();
+    staple = Link();
 
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and
       // ignore the dir_ignore direction (usually the temporal dim
       // when used with STOUT or APE for measurement smearing)
-      if (nu == mu || nu == dir_ignore) continue;
-
-      for (int rho = 0; rho < 4; ++rho) {
-        if (rho == mu || rho == nu || rho == dir_ignore) continue;
-
-        const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore);
-        const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore);
+      if (nu != mu && nu != dir_ignore) {
+#pragma unroll
+        for (int rho = 0; rho < 4; ++rho) {
+          if (rho != mu && rho != nu && rho != dir_ignore) {
+            const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore);
+            const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore);
 
-        accumulateStaple(staple[0], arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu});
+            staple = staple + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu});
+          }
+        }
       }
     }
   }
@@ -371,7 +342,7 @@ namespace quda
           arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
         }
       } else if constexpr (Arg::level == 2) {
-        computeStaple3DLevel2(arg, x, dx, parity, dir, Stap, arg.dir_ignore);
+        computeStaple3DLevel2(arg, x, dx, parity, dir, Stap[0], arg.dir_ignore);
 
         TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.));
         polarSu3<real>(TestU, arg.tolerance);

From a3fae46a8413d615ded224e8f899db2f4012c366 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 1 Feb 2024 12:53:34 -0800
Subject: [PATCH 15/22] Updated timers

---
 lib/gauge_hyp.cu | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index 2090d19dd5..aea4723309 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -128,20 +128,40 @@ namespace quda
     if (dir_ignore == 4) {
       copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
       in.exchangeExtendedGhost(in.R(), false);
+
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha3, 1, dir_ignore);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+
       tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false);
       tmp[1]->exchangeExtendedGhost(tmp[1]->R(), false);
+
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+
       tmp[2]->exchangeExtendedGhost(tmp[2]->R(), false);
       tmp[3]->exchangeExtendedGhost(tmp[3]->R(), false);
+
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha1, 3, dir_ignore);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+
       out.exchangeExtendedGhost(out.R(), false);
     } else {
       copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
       in.exchangeExtendedGhost(in.R(), false);
+
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha3, 1, dir_ignore);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+
       tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false);
+
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+
       out.exchangeExtendedGhost(out.R(), false);
     }
 

From d4a24e5422969d9406a9a6ebea1bd4b7c0df88e5 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Thu, 1 Feb 2024 14:00:19 -0800
Subject: [PATCH 16/22] clang-format

---
 include/gauge_tools.h         |  3 +--
 include/kernels/gauge_hyp.cuh | 42 ++++++++++++++++++++---------------
 lib/gauge_hyp.cu              | 12 +++++-----
 lib/interface_quda.cpp        |  3 +--
 4 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/include/gauge_tools.h b/include/gauge_tools.h
index 4c782d0ddc..f6bc3f22b9 100644
--- a/include/gauge_tools.h
+++ b/include/gauge_tools.h
@@ -129,8 +129,7 @@ namespace quda
      @param[in] alpha3 smearing parameter
      @param[in] dir_ignore ignored direction
   */
-  void HYPStep(GaugeField &dataDs, GaugeField &dataOr, double alpha1, double alpha2,
-               double alpha3, int dir_ignore);
+  void HYPStep(GaugeField &dataDs, GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore);
 
   /**
      @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field.
diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 9c18cabd59..217958c935 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -30,7 +30,7 @@ namespace quda
     const int dir_ignore;
     const Float tolerance;
 
-    GaugeHYPArg(GaugeField &out, GaugeField* tmp[4], const GaugeField &in, double alpha, int dir_ignore) :
+    GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) :
       kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)),
       out(out),
       tmp {*tmp[0], *tmp[1], *tmp[2], *tmp[3]},
@@ -61,9 +61,9 @@ namespace quda
      @return The computed staple
   */
   template <typename Arg>
-  __host__ __device__ inline Matrix<complex<typename Arg::Float>, Arg::nColor> accumulateStaple(const Arg &arg,
-          const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu,
-          int x[], thread_array<int, 4> &dx, int parity, int2 tensor_arg, int2 shifts)
+  __host__ __device__ inline Matrix<complex<typename Arg::Float>, Arg::nColor>
+  accumulateStaple(const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[],
+                   thread_array<int, 4> &dx, int parity, int2 tensor_arg, int2 shifts)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     Link staple;
@@ -85,12 +85,12 @@ namespace quda
 
       /* load matrix B*/
       dx[nu]++;
-      Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1-parity);
+      Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1 - parity);
       dx[nu]--;
 
       /* load matrix C*/
       dx[mu]++;
-      Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity);
+      Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1 - parity);
       dx[mu]--;
 
       staple = a * b * conj(c);
@@ -106,10 +106,10 @@ namespace quda
     {
       /* load matrix A*/
       dx[nu]--;
-      Link a = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity);
+      Link a = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1 - parity);
 
       /* load matrix B*/
-      Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1-parity);
+      Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1 - parity);
 
       /* load matrix C*/
       dx[mu]++;
@@ -117,7 +117,7 @@ namespace quda
       dx[mu]--;
       dx[nu]++;
 
-      staple = staple + conj(a)*b*c;
+      staple = staple + conj(a) * b * c;
     }
 
     return staple;
@@ -137,7 +137,7 @@ namespace quda
       // ignore the dir_ignore direction (usually the temporal dim
       // when used with STOUT or APE for measurement smearing)
       if (nu != mu) {
-        Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, { mu, nu }, { mu, nu });
+        Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu});
         switch (cnt) {
         case 0: staple[0] = staple[0] + accum; break;
         case 1: staple[1] = staple[1] + accum; break;
@@ -171,7 +171,8 @@ namespace quda
           const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
           const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
 
-          Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho});
+          Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity,
+                                        {sigma_with_mu, sigma_with_rho}, {mu, rho});
           switch (cnt) {
           case 0: staple[0] = staple[0] + accum; break;
           case 1: staple[1] = staple[1] + accum; break;
@@ -199,7 +200,9 @@ namespace quda
         const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu);
         const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
 
-        staple = staple + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, {mu, nu});
+        staple = staple
+          + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu},
+                             {mu, nu});
       }
     }
   }
@@ -218,7 +221,7 @@ namespace quda
       int x[4];
       getCoords(x, x_cb, arg.X, parity);
 #pragma unroll
-      for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
+      for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
 
       thread_array<int, 4> dx = {0, 0, 0, 0};
 
@@ -257,8 +260,9 @@ namespace quda
   };
 
   template <typename Arg>
-  __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
-                                                        int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[2], const int dir_ignore)
+  __host__ __device__ inline void
+  computeStaple3DLevel1(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity, int mu,
+                        Matrix<complex<typename Arg::Float>, Arg::nColor> staple[2], const int dir_ignore)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     for (int i = 0; i < 2; ++i) staple[i] = Link();
@@ -282,7 +286,8 @@ namespace quda
 
   template <typename Arg>
   __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
-                                                        int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> &staple, int dir_ignore)
+                                                        int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> &staple,
+                                                        int dir_ignore)
   {
     using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
     staple = Link();
@@ -299,7 +304,8 @@ namespace quda
             const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore);
             const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore);
 
-            staple = staple + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu});
+            staple = staple
+              + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu});
           }
         }
       }
@@ -320,7 +326,7 @@ namespace quda
       int x[4];
       getCoords(x, x_cb, arg.X, parity);
 #pragma unroll
-      for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
+      for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
 
       thread_array<int, 4> dx = {0, 0, 0, 0};
 
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index aea4723309..9097a9a3c3 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -10,7 +10,7 @@ namespace quda
   template <typename Float, int nColor, QudaReconstructType recon> class GaugeHYP : TunableKernel3D
   {
     GaugeField &out;
-    GaugeField* tmp[4];
+    GaugeField *tmp[4];
     const GaugeField &in;
     const Float alpha;
     const int level;
@@ -21,7 +21,7 @@ namespace quda
 
   public:
     // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
-    GaugeHYP(GaugeField &out, GaugeField* tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) :
+    GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) :
       TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
       tmp {tmp[0], tmp[1], tmp[2], tmp[3]},
@@ -98,8 +98,7 @@ namespace quda
 
   }; // GaugeAPE
 
-  void HYPStep(GaugeField &out, GaugeField &in, double alpha1, double alpha2, double alpha3,
-               int dir_ignore)
+  void HYPStep(GaugeField &out, GaugeField &in, double alpha1, double alpha2, double alpha3, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
@@ -108,12 +107,12 @@ namespace quda
     GaugeFieldParam gParam(out);
     gParam.location = QUDA_CUDA_FIELD_LOCATION;
     const int smearDim = (dir_ignore >= 0 && dir_ignore <= 3) ? 3 : 4;
-    //GaugeField tmp[4];
+    // GaugeField tmp[4];
     gParam.geometry = QUDA_TENSOR_GEOMETRY;
     GaugeFieldParam gParam2(gParam);
     gParam2.create = QUDA_REFERENCE_FIELD_CREATE;
 
-    GaugeField* tmp[4];
+    GaugeField *tmp[4];
     if (smearDim == 3) {
       tmp[0] = new GaugeField(gParam);
       // aux[1], aux[2] and aux[3] will not be used for smearDim == 3
@@ -166,7 +165,6 @@ namespace quda
     }
 
     for (int i = 0; i < 4; i++) delete tmp[i];
-
   }
 
 } // namespace quda
\ No newline at end of file
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 073ddb6f63..b44d4c534b 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -5094,8 +5094,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
       OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore);
       break;
     case QUDA_GAUGE_SMEAR_HYP:
-      HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3,
-              smear_param->dir_ignore);
+      HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore);
       break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
     }

From 672177670ebb583c9b3f10d79f583291625917a9 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Mon, 5 Feb 2024 09:21:45 -0800
Subject: [PATCH 17/22] Clang error: unused "using"

---
 include/kernels/gauge_hyp.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 9c18cabd59..a092b2ec0e 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -188,8 +188,6 @@ namespace quda
   __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
                                                       int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> &staple)
   {
-    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
-
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and

From a2326f87e7680c4679a0c09ba22980ecffd7fb16 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Mon, 5 Feb 2024 09:27:26 -0800
Subject: [PATCH 18/22] Removed superfluous getProfile calls

---
 lib/gauge_hyp.cu | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index aea4723309..0e7e79c4fa 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -125,45 +125,27 @@ namespace quda
 
     if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
 
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (dir_ignore == 4) {
       copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
       in.exchangeExtendedGhost(in.R(), false);
-
-      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha3, 1, dir_ignore);
-      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-
       tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false);
       tmp[1]->exchangeExtendedGhost(tmp[1]->R(), false);
-
-      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
-      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-
       tmp[2]->exchangeExtendedGhost(tmp[2]->R(), false);
       tmp[3]->exchangeExtendedGhost(tmp[3]->R(), false);
-
-      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha1, 3, dir_ignore);
-      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-
       out.exchangeExtendedGhost(out.R(), false);
     } else {
       copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
       in.exchangeExtendedGhost(in.R(), false);
-
-      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha3, 1, dir_ignore);
-      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-
       tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false);
-
-      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
-      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-
       out.exchangeExtendedGhost(out.R(), false);
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
 
     for (int i = 0; i < 4; i++) delete tmp[i];
 

From 0892e264aae823da98f06e12c1e9cf21c333438a Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Tue, 6 Feb 2024 14:45:06 +0800
Subject: [PATCH 19/22] Remove unused param to make compiler happy.

---
 lib/gauge_hyp.cu | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index 9aa2995b9f..051ea2962b 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -104,25 +104,22 @@ namespace quda
     checkReconstruct(out, in);
     checkNative(out, in);
 
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
     GaugeFieldParam gParam(out);
     gParam.location = QUDA_CUDA_FIELD_LOCATION;
-    const int smearDim = (dir_ignore >= 0 && dir_ignore <= 3) ? 3 : 4;
-    // GaugeField tmp[4];
     gParam.geometry = QUDA_TENSOR_GEOMETRY;
-    GaugeFieldParam gParam2(gParam);
-    gParam2.create = QUDA_REFERENCE_FIELD_CREATE;
 
     GaugeField *tmp[4];
-    if (smearDim == 3) {
+    if (dir_ignore == 4) { // hypDim == 4
+      for (int i = 0; i < 4; ++i) { tmp[i] = new GaugeField(gParam); }
+    } else { // hypDim == 3
       tmp[0] = new GaugeField(gParam);
-      // aux[1], aux[2] and aux[3] will not be used for smearDim == 3
+      // tmp[1], tmp[2] and tmp[3] will not be used for hypDim == 3
       gParam.create = QUDA_REFERENCE_FIELD_CREATE;
       for (int i = 1; i < 4; ++i) { tmp[i] = new GaugeField(gParam); }
-    } else {
-      for (int i = 0; i < 4; ++i) { tmp[i] = new GaugeField(gParam); }
     }
 
-    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
 
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (dir_ignore == 4) {

From 16889318aecced86cbac1f4c467af978784f3a9a Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Tue, 6 Feb 2024 15:12:07 +0800
Subject: [PATCH 20/22] clang-14 + nvcc-11.8 think variables like
 `sigma_with_mu` unused. They might not happy with implicit conversion to
 `int2`.

---
 include/kernels/gauge_hyp.cuh | 8 ++++----
 lib/gauge_hyp.cu              | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 574a35d3f4..22843fce09 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -172,7 +172,7 @@ namespace quda
           const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
 
           Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity,
-                                        {sigma_with_mu, sigma_with_rho}, {mu, rho});
+                                        int2 {sigma_with_mu, sigma_with_rho}, {mu, rho});
           switch (cnt) {
           case 0: staple[0] = staple[0] + accum; break;
           case 1: staple[1] = staple[1] + accum; break;
@@ -199,8 +199,8 @@ namespace quda
         const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
 
         staple = staple
-          + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu},
-                             {mu, nu});
+          + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity,
+                             int2 {nu_with_mu, mu_with_nu}, {mu, nu});
       }
     }
   }
@@ -303,7 +303,7 @@ namespace quda
             const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore);
 
             staple = staple
-              + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu});
+              + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, int2 {rho_with_mu, rho_with_nu}, {mu, nu});
           }
         }
       }
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index 051ea2962b..142459d0cb 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -120,7 +120,6 @@ namespace quda
       for (int i = 1; i < 4; ++i) { tmp[i] = new GaugeField(gParam); }
     }
 
-
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (dir_ignore == 4) {
       copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);

From 74f9a236a063f32955d262d0ea67073fdee54629 Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Tue, 6 Feb 2024 15:38:10 +0800
Subject: [PATCH 21/22] Update comments.

---
 include/kernels/gauge_hyp.cuh | 42 +++++++++++++----------------------
 include/quda.h                |  6 ++---
 lib/gauge_ape.cu              |  2 +-
 lib/gauge_hyp.cu              |  2 +-
 lib/gauge_stout.cu            |  2 +-
 5 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
index 22843fce09..c444f86169 100644
--- a/include/kernels/gauge_hyp.cuh
+++ b/include/kernels/gauge_hyp.cuh
@@ -133,9 +133,6 @@ namespace quda
     int cnt = 0;
 #pragma unroll
     for (int nu = 0; nu < 4; ++nu) {
-      // Identify directions orthogonal to the link and
-      // ignore the dir_ignore direction (usually the temporal dim
-      // when used with STOUT or APE for measurement smearing)
       if (nu != mu) {
         Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu});
         switch (cnt) {
@@ -158,25 +155,23 @@ namespace quda
     int cnt = 0;
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
-      // Identify directions orthogonal to the link and
-      // ignore the dir_ignore direction (usually the temporal dim
-      // when used with STOUT or APE for measurement smearing)
       if (nu != mu) {
 #pragma unroll
         for (int rho = 0; rho < 4; ++rho) {
-          if (rho == mu || rho == nu) continue;
-          int sigma = 0;
-          while (sigma == mu || sigma == nu || sigma == rho) sigma += 1;
-
-          const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
-          const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
-
-          Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity,
-                                        int2 {sigma_with_mu, sigma_with_rho}, {mu, rho});
-          switch (cnt) {
-          case 0: staple[0] = staple[0] + accum; break;
-          case 1: staple[1] = staple[1] + accum; break;
-          case 2: staple[2] = staple[2] + accum; break;
+          if (rho != mu && rho != nu) {
+            int sigma = 0;
+            while (sigma == mu || sigma == nu || sigma == rho) sigma += 1;
+
+            const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
+            const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
+
+            Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity,
+                                          int2 {sigma_with_mu, sigma_with_rho}, {mu, rho});
+            switch (cnt) {
+            case 0: staple[0] = staple[0] + accum; break;
+            case 1: staple[1] = staple[1] + accum; break;
+            case 2: staple[2] = staple[2] + accum; break;
+            }
           }
         }
 
@@ -191,9 +186,6 @@ namespace quda
   {
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
-      // Identify directions orthogonal to the link and
-      // ignore the dir_ignore direction (usually the temporal dim
-      // when used with STOUT or APE for measurement smearing)
       if (nu != mu) {
         const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu);
         const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
@@ -269,8 +261,7 @@ namespace quda
 #pragma unroll
     for (int nu = 0; nu < 4; ++nu) {
       // Identify directions orthogonal to the link and
-      // ignore the dir_ignore direction (usually the temporal dim
-      // when used with STOUT or APE for measurement smearing)
+      // ignore the dir_ignore direction (usually the temporal dim)
       if (nu != mu && nu != dir_ignore) {
         Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu});
         switch (cnt) {
@@ -293,8 +284,7 @@ namespace quda
 #pragma unroll
     for (int nu = 0; nu < 4; nu++) {
       // Identify directions orthogonal to the link and
-      // ignore the dir_ignore direction (usually the temporal dim
-      // when used with STOUT or APE for measurement smearing)
+      // ignore the dir_ignore direction (usually the temporal dim)
       if (nu != mu && nu != dir_ignore) {
 #pragma unroll
         for (int rho = 0; rho < 4; ++rho) {
diff --git a/include/quda.h b/include/quda.h
index 4469481d86..2a5bffa13a 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -846,9 +846,9 @@ extern "C" {
                              Wilson/Symanzik flow */
     double alpha;         /**< The single coefficient used in APE smearing */
     double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */
-    double alpha1;                 /**< The first coefficient used in HYP smearing */
-    double alpha2;                 /**< The second coefficient used in HYP smearing */
-    double alpha3;                 /**< The third coefficient used in HYP smearing */
+    double alpha1;                 /**< The coefficient used in HYP smearing step 3 (will not be used in 3D smearing)*/
+    double alpha2;                 /**< The coefficient used in HYP smearing step 2*/
+    double alpha3;                 /**< The coefficient used in HYP smearing step 1*/
     unsigned int meas_interval;    /**< Perform the requested measurements on the gauge field at this interval */
     QudaGaugeSmearType smear_type; /**< The smearing type to perform */
     QudaBoolean restart;           /**< Used to restart the smearing from existing gaugeSmeared */
diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu
index 8d61066041..2174e3f9b2 100644
--- a/lib/gauge_ape.cu
+++ b/lib/gauge_ape.cu
@@ -17,7 +17,7 @@ namespace quda {
     unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array
 
   public:
-    // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
     GaugeAPE(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) :
       TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
index 142459d0cb..4b2c1e3a65 100644
--- a/lib/gauge_hyp.cu
+++ b/lib/gauge_hyp.cu
@@ -20,7 +20,7 @@ namespace quda
     unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array
 
   public:
-    // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
     GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) :
       TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu
index 2cf1c4e634..b3433784ce 100644
--- a/lib/gauge_stout.cu
+++ b/lib/gauge_stout.cu
@@ -25,7 +25,7 @@ namespace quda {
     }
 
   public:
-    // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
     GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon, int dir_ignore) :
       TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),

From 2b5a7c2f0f0a4e0f22654ae18aeaf064b54e038f Mon Sep 17 00:00:00 2001
From: SaltyChiang <SaltyChiang@users.noreply.github.com>
Date: Wed, 7 Feb 2024 02:50:27 +0800
Subject: [PATCH 22/22] Use `dir_ignore < 0` to represent the default
 situation. 3D for APE/STOUT and 4D for OVRIMP_STOUT/HYP.

---
 include/quda.h         |  3 ++-
 lib/check_params.h     |  2 +-
 lib/interface_quda.cpp | 15 +++++++++++----
 tests/su3_test.cpp     |  7 ++++---
 4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/quda.h b/include/quda.h
index 2a5bffa13a..de88d1d0d2 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -853,7 +853,8 @@ extern "C" {
     QudaGaugeSmearType smear_type; /**< The smearing type to perform */
     QudaBoolean restart;           /**< Used to restart the smearing from existing gaugeSmeared */
     double t0;                     /**< Starting flow time for Wilson flow */
-    int dir_ignore;                /**< The direction to be ignored by the smearing algorithm */
+    int dir_ignore;                /**< The direction to be ignored by the smearing algorithm
+                                        A negative value means 3D for APE/STOUT and 4D for OVRIMP_STOUT/HYP */
   } QudaGaugeSmearParam;
 
   typedef struct QudaBLASParam_s {
diff --git a/lib/check_params.h b/lib/check_params.h
index fc4d80d315..86800b0e7e 100644
--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -1105,7 +1105,7 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param)
   P(alpha1, 0.0);
   P(alpha2, 0.0);
   P(alpha3, 0.0);
-  P(dir_ignore, 4);
+  P(dir_ignore, -1);
 #else
   P(n_steps, (unsigned int)INVALID_INT);
   P(meas_interval, (unsigned int)INVALID_INT);
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index b44d4c534b..f8c2fcda74 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -5086,15 +5086,22 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
   gaugeObservablesQuda(&obs_param[measurement_n]);
   logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge);
 
+  // set default dir_ignore = 3 for APE and STOUT for compatibility
+  int dir_ignore = smear_param->dir_ignore;
+  if (dir_ignore < 0
+      && (smear_param->smear_type == QUDA_GAUGE_SMEAR_APE || smear_param->smear_type == QUDA_GAUGE_SMEAR_STOUT)) {
+    dir_ignore = 3;
+  }
+
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
     switch (smear_param->smear_type) {
-    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha, smear_param->dir_ignore); break;
-    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->dir_ignore); break;
+    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha, dir_ignore); break;
+    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho, dir_ignore); break;
     case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
-      OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore);
+      OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, dir_ignore);
       break;
     case QUDA_GAUGE_SMEAR_HYP:
-      HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore);
+      HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, dir_ignore);
       break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
     }
diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp
index acd2803b42..265fed1248 100644
--- a/tests/su3_test.cpp
+++ b/tests/su3_test.cpp
@@ -27,7 +27,7 @@ double gauge_smear_alpha2 = 0.6;
 double gauge_smear_alpha3 = 0.3;
 int gauge_smear_steps = 50;
 QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
-int gauge_smear_dir_ignore = 4;
+int gauge_smear_dir_ignore = -1;
 int measurement_interval = 5;
 bool su_project = true;
 
@@ -98,8 +98,9 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)");
   opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)");
 
-  opgroup->add_option("--su3-smear-dir-ignore", gauge_smear_dir_ignore,
-                      "direction to be ignored by the smearing (default 4)");
+  opgroup->add_option(
+    "--su3-smear-dir-ignore", gauge_smear_dir_ignore,
+    "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)");
 
   opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)");