diff --git a/include/enum_quda.h b/include/enum_quda.h
index c4cbb59901..60c186eb77 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -571,6 +571,7 @@ typedef enum QudaGaugeSmearType_s {
   QUDA_GAUGE_SMEAR_APE,
   QUDA_GAUGE_SMEAR_STOUT,
   QUDA_GAUGE_SMEAR_OVRIMP_STOUT,
+  QUDA_GAUGE_SMEAR_HYP,
   QUDA_GAUGE_SMEAR_WILSON_FLOW,
   QUDA_GAUGE_SMEAR_SYMANZIK_FLOW,
   QUDA_GAUGE_SMEAR_INVALID = QUDA_INVALID_ENUM
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index 6a8708948a..ada17e2cc8 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -504,8 +504,9 @@
 #define QUDA_GAUGE_SMEAR_APE 0
 #define QUDA_GAUGE_SMEAR_STOUT 1
 #define QUDA_GAUGE_SMEAR_OVRIMP_STOUT 2
-#define QUDA_GAUGE_SMEAR_WILSON_FLOW 3
-#define QUDA_GAUGE_SMEAR_SYMANZIK_FLOW 4
+#define QUDA_GAUGE_SMEAR_HYP 3
+#define QUDA_GAUGE_SMEAR_WILSON_FLOW 4
+#define QUDA_GAUGE_SMEAR_SYMANZIK_FLOW 5
 #define QUDA_GAUGE_SMEAR_INVALID QUDA_INVALID_ENUM
 
 #define QudaFermionSmearType integer(4)
diff --git a/include/gauge_field.h b/include/gauge_field.h
index ffea94c22b..a62b6a684f 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -638,6 +638,19 @@ namespace quda {
      */
     static bool are_compatible_weak(const GaugeField &a, const GaugeField &b);
 
+    /**
+     * @brief Helper function that returns the default tolerance used by SU(3) projection
+     * @return The default tolerance, which is ~10x epsilon
+     */
+    double toleranceSU3() const
+    {
+      switch (precision) {
+      case QUDA_DOUBLE_PRECISION: return 2e-15;
+      case QUDA_SINGLE_PRECISION: return 1e-6;
+      default: return 1e-6;
+      }
+    }
+
     friend struct GaugeFieldParam;
   };
 
diff --git a/include/gauge_tools.h b/include/gauge_tools.h
index 9b7d68db37..f6bc3f22b9 100644
--- a/include/gauge_tools.h
+++ b/include/gauge_tools.h
@@ -95,8 +95,9 @@ namespace quda
      @param[out] dataDs Output smeared field
      @param[in] dataOr Input gauge field
      @param[in] alpha smearing parameter
+     @param[in] dir_ignore ignored direction
   */
-  void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha);
+  void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha, int dir_ignore);
 
   /**
      @brief Apply STOUT smearing to the gauge field
@@ -104,8 +105,9 @@ namespace quda
      @param[out] dataDs Output smeared field
      @param[in] dataOr Input gauge field
      @param[in] rho smearing parameter
+     @param[in] dir_ignore ignored direction
   */
-  void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho);
+  void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, int dir_ignore);
 
   /**
      @brief Apply Over Improved STOUT smearing to the gauge field
@@ -114,8 +116,20 @@ namespace quda
      @param[in] dataOr Input gauge field
      @param[in] rho smearing parameter
      @param[in] epsilon smearing parameter
+     @param[in] dir_ignore ignored direction
   */
-  void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon);
+  void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon, int dir_ignore);
+
+  /**
+     @brief Apply HYP smearing to the gauge field
+     @param[out] dataDs Output smeared field
+     @param[in] dataOr Input gauge field
+     @param[in] alpha1 smearing parameter
+     @param[in] alpha2 smearing parameter
+     @param[in] alpha3 smearing parameter
+     @param[in] dir_ignore ignored direction
+  */
+  void HYPStep(GaugeField &dataDs, GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore);
 
   /**
      @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field.
diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh
index 3bf490dfea..f8e00d03db 100644
--- a/include/kernels/gauge_ape.cuh
+++ b/include/kernels/gauge_ape.cuh
@@ -8,9 +8,6 @@
 namespace quda
 {
 
-#define  DOUBLE_TOL	1e-15
-#define  SINGLE_TOL	2e-6
-
   template <typename Float_, int nColor_, QudaReconstructType recon_, int apeDim_>
   struct GaugeAPEArg : kernel_param<> {
     using Float = Float_;
@@ -26,14 +23,16 @@ namespace quda
     int X[4];    // grid dimensions
     int border[4];
     const Float alpha;
+    const int dir_ignore;
     const Float tolerance;
 
-    GaugeAPEArg(GaugeField &out, const GaugeField &in, double alpha) :
+    GaugeAPEArg(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) :
       kernel_param(dim3(in.LocalVolumeCB(), 2, apeDim)),
       out(out),
       in(in),
       alpha(alpha),
-      tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL)
+      dir_ignore(dir_ignore),
+      tolerance(in.toleranceSU3())
     {
       for (int dir = 0; dir < 4; ++dir) {
         border[dir] = in.R()[dir];
@@ -61,16 +60,17 @@ namespace quda
         x[dr] += arg.border[dr];
         X[dr] += 2 * arg.border[dr];
       }
+      dir = dir + (dir >= arg.dir_ignore);
 
       int dx[4] = {0, 0, 0, 0};
       Link U, Stap, TestU, I;
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
-      computeStaple(arg, x, X, parity, dir, Stap, Arg::apeDim);
+      computeStaple(arg, x, X, parity, dir, Stap, arg.dir_ignore);
 
       // Get link U
       U = arg.in(dir, linkIndexShift(x, dx, X), parity);
-    
-      Stap = Stap * (arg.alpha / ((real)(2. * (3. - 1.))));
+
+      Stap = Stap * (arg.alpha / ((real)(2 * (Arg::apeDim - 1))));
       setIdentity(&I);
 
       TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap * conj(U);
diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh
new file mode 100644
index 0000000000..c444f86169
--- /dev/null
+++ b/include/kernels/gauge_hyp.cuh
@@ -0,0 +1,347 @@
+#include <gauge_field_order.h>
+#include <index_helper.cuh>
+#include <quda_matrix.h>
+#include <su3_project.cuh>
+#include <kernel.h>
+#include <fast_intdiv.h>
+#include <kernels/gauge_utils.cuh>
+
+namespace quda
+{
+
+  template <typename Float_, int nColor_, QudaReconstructType recon_, int level_, int hypDim_>
+  struct GaugeHYPArg : kernel_param<> {
+    using Float = Float_;
+    static constexpr int nColor = nColor_;
+    static_assert(nColor == 3, "Only nColor=3 enabled at this time");
+    static constexpr QudaReconstructType recon = recon_;
+    static constexpr int hypDim = hypDim_;
+    static constexpr int level = level_;
+    typedef typename gauge_mapper<Float, recon>::type Gauge;
+
+    Gauge out;
+    Gauge tmp[4];
+    const Gauge in;
+
+    int_fastdiv E[4]; // extended grid dimensions
+    int_fastdiv X[4]; // grid dimensions
+    int border[4];
+    const Float alpha;
+    const int dir_ignore;
+    const Float tolerance;
+
+    GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) :
+      kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)),
+      out(out),
+      tmp {*tmp[0], *tmp[1], *tmp[2], *tmp[3]},
+      in(in),
+      alpha(alpha),
+      dir_ignore(dir_ignore),
+      tolerance(in.toleranceSU3())
+    {
+      for (int dir = 0; dir < 4; ++dir) {
+        border[dir] = in.R()[dir];
+        E[dir] = in.X()[dir];
+        X[dir] = in.X()[dir] - border[dir] * 2;
+      }
+    }
+  };
+
+  /**
+     @brief Calculates a staple as part of the HYP calculation, returning the product
+
+     @param[in] arg Kernel argument
+     @param[in] gauge_mu Gauge/tensor field used for the parallel direction
+     @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction
+     @param[in] x Full index array
+     @param[in,out] dx Full index offset array; leaves the function with the same values it entered with
+     @param[in] parity Parity index
+     @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields
+     @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets
+     @return The computed staple
+  */
+  template <typename Arg>
+  __host__ __device__ inline Matrix<complex<typename Arg::Float>, Arg::nColor>
+  accumulateStaple(const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[],
+                   thread_array<int, 4> &dx, int parity, int2 tensor_arg, int2 shifts)
+  {
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
+    Link staple;
+
+    // for readability
+    const int mu = shifts.x;
+    const int nu = shifts.y;
+
+    /* Computes the upper staple :
+     *                 mu (B)
+     *               +-------+
+     *         nu    |       |
+     *         (A)   |       |(C)
+     *               X       X
+     */
+    {
+      /* load matrix A*/
+      Link a = gauge_nu(tensor_arg.y, linkIndex(x, arg.E), parity);
+
+      /* load matrix B*/
+      dx[nu]++;
+      Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1 - parity);
+      dx[nu]--;
+
+      /* load matrix C*/
+      dx[mu]++;
+      Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1 - parity);
+      dx[mu]--;
+
+      staple = a * b * conj(c);
+    }
+
+    /* Computes the lower staple :
+     *                 X       X
+     *           nu    |       |
+     *           (A)   |       | (C)
+     *                 +-------+
+     *                   mu (B)
+     */
+    {
+      /* load matrix A*/
+      dx[nu]--;
+      Link a = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1 - parity);
+
+      /* load matrix B*/
+      Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1 - parity);
+
+      /* load matrix C*/
+      dx[mu]++;
+      Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), parity);
+      dx[mu]--;
+      dx[nu]++;
+
+      staple = staple + conj(a) * b * c;
+    }
+
+    return staple;
+  }
+
+  template <typename Arg>
+  __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
+                                                      int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
+  {
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
+    for (int i = 0; i < 3; ++i) staple[i] = Link();
+
+    int cnt = 0;
+#pragma unroll
+    for (int nu = 0; nu < 4; ++nu) {
+      if (nu != mu) {
+        Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu});
+        switch (cnt) {
+        case 0: staple[0] = staple[0] + accum; break;
+        case 1: staple[1] = staple[1] + accum; break;
+        case 2: staple[2] = staple[2] + accum; break;
+        }
+        cnt++;
+      }
+    }
+  }
+
+  template <typename Arg>
+  __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
+                                                      int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> staple[3])
+  {
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
+    for (int i = 0; i < 3; ++i) staple[i] = Link();
+
+    int cnt = 0;
+#pragma unroll
+    for (int nu = 0; nu < 4; nu++) {
+      if (nu != mu) {
+#pragma unroll
+        for (int rho = 0; rho < 4; ++rho) {
+          if (rho != mu && rho != nu) {
+            int sigma = 0;
+            while (sigma == mu || sigma == nu || sigma == rho) sigma += 1;
+
+            const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho);
+            const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu);
+
+            Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity,
+                                          int2 {sigma_with_mu, sigma_with_rho}, {mu, rho});
+            switch (cnt) {
+            case 0: staple[0] = staple[0] + accum; break;
+            case 1: staple[1] = staple[1] + accum; break;
+            case 2: staple[2] = staple[2] + accum; break;
+            }
+          }
+        }
+
+        cnt++;
+      }
+    }
+  }
+
+  template <typename Arg>
+  __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
+                                                      int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> &staple)
+  {
+#pragma unroll
+    for (int nu = 0; nu < 4; nu++) {
+      if (nu != mu) {
+        const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu);
+        const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu);
+
+        staple = staple
+          + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity,
+                             int2 {nu_with_mu, mu_with_nu}, {mu, nu});
+      }
+    }
+  }
+
+  template <typename Arg> struct HYP {
+    const Arg &arg;
+    constexpr HYP(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
+
+    __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
+    {
+      using real = typename Arg::Float;
+      using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
+
+      // compute spacetime and local coords
+      int x[4];
+      getCoords(x, x_cb, arg.X, parity);
+#pragma unroll
+      for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
+
+      thread_array<int, 4> dx = {0, 0, 0, 0};
+
+      Link U, Stap[3], TestU, I;
+
+      // Get link U
+      U = arg.in(dir, linkIndexShift(x, dx, arg.E), parity);
+      setIdentity(&I);
+
+      if constexpr (Arg::level == 1) {
+        computeStapleLevel1(arg, x, dx, parity, dir, Stap);
+
+#pragma unroll
+        for (int i = 0; i < 3; ++i) {
+          TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.));
+          polarSu3<real>(TestU, arg.tolerance);
+          arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
+        }
+      } else if constexpr (Arg::level == 2) {
+        computeStapleLevel2(arg, x, dx, parity, dir, Stap);
+
+#pragma unroll
+        for (int i = 0; i < 3; ++i) {
+          TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)4.));
+          polarSu3<real>(TestU, arg.tolerance);
+          arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
+        }
+      } else if constexpr (Arg::level == 3) {
+        computeStapleLevel3(arg, x, dx, parity, dir, Stap[0]);
+
+        TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.));
+        polarSu3<real>(TestU, arg.tolerance);
+        arg.out(dir, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
+      }
+    }
+  };
+
+  template <typename Arg>
+  __host__ __device__ inline void
+  computeStaple3DLevel1(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity, int mu,
+                        Matrix<complex<typename Arg::Float>, Arg::nColor> staple[2], const int dir_ignore)
+  {
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
+    for (int i = 0; i < 2; ++i) staple[i] = Link();
+
+    int cnt = 0;
+#pragma unroll
+    for (int nu = 0; nu < 4; ++nu) {
+      // Identify directions orthogonal to the link and
+      // ignore the dir_ignore direction (usually the temporal dim)
+      if (nu != mu && nu != dir_ignore) {
+        Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu});
+        switch (cnt) {
+        case 0: staple[0] = staple[0] + accum; break;
+        case 1: staple[1] = staple[1] + accum; break;
+        }
+        cnt++;
+      }
+    }
+  }
+
+  template <typename Arg>
+  __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], thread_array<int, 4> &dx, int parity,
+                                                        int mu, Matrix<complex<typename Arg::Float>, Arg::nColor> &staple,
+                                                        int dir_ignore)
+  {
+    using Link = Matrix<complex<typename Arg::Float>, Arg::nColor>;
+    staple = Link();
+
+#pragma unroll
+    for (int nu = 0; nu < 4; nu++) {
+      // Identify directions orthogonal to the link and
+      // ignore the dir_ignore direction (usually the temporal dim)
+      if (nu != mu && nu != dir_ignore) {
+#pragma unroll
+        for (int rho = 0; rho < 4; ++rho) {
+          if (rho != mu && rho != nu && rho != dir_ignore) {
+            const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore);
+            const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore);
+
+            staple = staple
+              + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, int2 {rho_with_mu, rho_with_nu}, {mu, nu});
+          }
+        }
+      }
+    }
+  }
+
+  template <typename Arg> struct HYP3D {
+    const Arg &arg;
+    constexpr HYP3D(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
+
+    __device__ __host__ inline void operator()(int x_cb, int parity, int dir)
+    {
+      using real = typename Arg::Float;
+      typedef Matrix<complex<real>, Arg::nColor> Link;
+
+      // compute spacetime and local coords
+      int x[4];
+      getCoords(x, x_cb, arg.X, parity);
+#pragma unroll
+      for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates
+
+      thread_array<int, 4> dx = {0, 0, 0, 0};
+
+      int dir_ = dir;
+      dir = dir + (dir >= arg.dir_ignore);
+      Link U, Stap[2], TestU, I;
+
+      // Get link U
+      U = arg.in(dir, linkIndexShift(x, dx, arg.E), parity);
+      setIdentity(&I);
+
+      if constexpr (Arg::level == 1) {
+        computeStaple3DLevel1(arg, x, dx, parity, dir, Stap, arg.dir_ignore);
+
+#pragma unroll
+        for (int i = 0; i < 2; ++i) {
+          TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.));
+          polarSu3<real>(TestU, arg.tolerance);
+          arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
+        }
+      } else if constexpr (Arg::level == 2) {
+        computeStaple3DLevel2(arg, x, dx, parity, dir, Stap[0], arg.dir_ignore);
+
+        TestU = I * (static_cast<real>(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.));
+        polarSu3<real>(TestU, arg.tolerance);
+        arg.out(dir, linkIndexShift(x, dx, arg.E), parity) = TestU * U;
+      }
+    }
+  };
+} // namespace quda
\ No newline at end of file
diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh
index 2a512e77e3..f9072a4ee8 100644
--- a/include/kernels/gauge_stout.cuh
+++ b/include/kernels/gauge_stout.cuh
@@ -28,14 +28,16 @@ namespace quda
     const Float rho;
     const Float staple_coeff;
     const Float rectangle_coeff;
+    const int dir_ignore;
 
-    STOUTArg(GaugeField &out, const GaugeField &in, Float rho, Float epsilon = 0) :
+    STOUTArg(GaugeField &out, const GaugeField &in, Float rho, Float epsilon, int dir_ignore) :
       kernel_param(dim3(1, 2, stoutDim)),
       out(out),
       in(in),
       rho(rho),
       staple_coeff(rho * (5.0 - 2.0 * epsilon) / 3.0),
-      rectangle_coeff(rho * (1.0 - epsilon) / 12.0)
+      rectangle_coeff(rho * (1.0 - epsilon) / 12.0),
+      dir_ignore(dir_ignore)
     {
       for (int dir = 0; dir < 4; ++dir) {
         border[dir] = in.R()[dir];
@@ -67,11 +69,12 @@ namespace quda
         x[dr] += arg.border[dr];
         X[dr] += 2 * arg.border[dr];
       }
+      dir = dir + (dir >= arg.dir_ignore);
 
       Link U, Stap, Q;
 
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
-      computeStaple(arg, x, X, parity, dir, Stap, Arg::stoutDim);
+      computeStaple(arg, x, X, parity, dir, Stap, arg.dir_ignore);
 
       // Get link U
       U = arg.in(dir, linkIndex(x, X), parity);
@@ -133,6 +136,7 @@ namespace quda
         x[dr] += arg.border[dr];
         X[dr] += 2 * arg.border[dr];
       }
+      dir = dir + (dir >= arg.dir_ignore);
 
       Link U, Q;
       ThreadLocalCache<Link, 0, computeStapleRectangleOps> Stap;
@@ -141,7 +145,7 @@ namespace quda
       // This function gets stap = S_{mu,nu} i.e., the staple of length 3,
       // and the 1x2 and 2x1 rectangles of length 5. From the following paper:
       // https://arxiv.org/abs/0801.1165
-      computeStapleRectangle(arg, x, X, parity, dir, Stap, Rect, Arg::stoutDim);
+      computeStapleRectangle(arg, x, X, parity, dir, Stap, Rect, arg.dir_ignore);
 
       // Get link U
       U = arg.in(dir, linkIndex(x, X), parity);
diff --git a/include/quda.h b/include/quda.h
index ff49a2940b..de88d1d0d2 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -846,10 +846,15 @@ extern "C" {
                              Wilson/Symanzik flow */
     double alpha;         /**< The single coefficient used in APE smearing */
     double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */
+    double alpha1;                 /**< The coefficient used in HYP smearing step 3 (will not be used in 3D smearing)*/
+    double alpha2;                 /**< The coefficient used in HYP smearing step 2*/
+    double alpha3;                 /**< The coefficient used in HYP smearing step 1*/
     unsigned int meas_interval;    /**< Perform the requested measurements on the gauge field at this interval */
     QudaGaugeSmearType smear_type; /**< The smearing type to perform */
-    QudaBoolean restart; /**< Used to restart the smearing from existing gaugeSmeared */
-    double t0;  /**< Starting flow time for Wilson flow */
+    QudaBoolean restart;           /**< Used to restart the smearing from existing gaugeSmeared */
+    double t0;                     /**< Starting flow time for Wilson flow */
+    int dir_ignore;                /**< The direction to be ignored by the smearing algorithm
+                                        A negative value means 3D for APE/STOUT and 4D for OVRIMP_STOUT/HYP */
   } QudaGaugeSmearParam;
 
   typedef struct QudaBLASParam_s {
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index d8c1d8342b..e9b7945ea7 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -27,7 +27,7 @@ set (QUDA_OBJS
   gauge_phase.cu timer.cpp
   solver.cpp inv_bicgstab_quda.cpp inv_cg_quda.cpp inv_bicgstabl_quda.cpp
   inv_multi_cg_quda.cpp inv_eigcg_quda.cpp gauge_ape.cu
-  gauge_stout.cu gauge_wilson_flow.cu gauge_plaq.cu
+  gauge_stout.cu gauge_hyp.cu gauge_wilson_flow.cu gauge_plaq.cu
   gauge_laplace.cpp gauge_observable.cpp
   inv_cg3_quda.cpp inv_ca_gcr.cpp inv_ca_cg.cpp
   inv_gcr_quda.cpp inv_mr_quda.cpp inv_sd_quda.cpp
diff --git a/lib/check_params.h b/lib/check_params.h
index bc88bc8b26..86800b0e7e 100644
--- a/lib/check_params.h
+++ b/lib/check_params.h
@@ -1102,6 +1102,10 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param)
   P(epsilon, 0.0);
   P(restart, QUDA_BOOLEAN_FALSE);
   P(t0, 0.0);
+  P(alpha1, 0.0);
+  P(alpha2, 0.0);
+  P(alpha3, 0.0);
+  P(dir_ignore, -1);
 #else
   P(n_steps, (unsigned int)INVALID_INT);
   P(meas_interval, (unsigned int)INVALID_INT);
@@ -1110,6 +1114,10 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param)
   P(epsilon, INVALID_DOUBLE);
   P(restart, QUDA_BOOLEAN_INVALID);
   P(t0, INVALID_DOUBLE);
+  P(alpha1, INVALID_DOUBLE);
+  P(alpha2, INVALID_DOUBLE);
+  P(alpha3, INVALID_DOUBLE);
+  P(dir_ignore, INVALID_INT);
 #endif
 
 #ifdef INIT_PARAM
diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu
index a2c8a92dd8..2174e3f9b2 100644
--- a/lib/gauge_ape.cu
+++ b/lib/gauge_ape.cu
@@ -8,21 +8,26 @@ namespace quda {
 
   template <typename Float, int nColor, QudaReconstructType recon> class GaugeAPE : TunableKernel3D
   {
-    static constexpr int apeDim = 3; // apply APE in space only
     GaugeField &out;
     const GaugeField &in;
     const Float alpha;
+    const int dir_ignore;
+    const int apeDim;
     unsigned int minThreads() const { return in.LocalVolumeCB(); }
     unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array
 
   public:
-    // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim
-    GaugeAPE(GaugeField &out, const GaugeField &in, double alpha) :
-      TunableKernel3D(in, 2, apeDim),
+    // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    GaugeAPE(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) :
+      TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
       in(in),
-      alpha(static_cast<Float>(alpha))
+      alpha(static_cast<Float>(alpha)),
+      dir_ignore(dir_ignore),
+      apeDim((dir_ignore == 4) ? 4 : 3)
     {
+      strcat(aux, ",dir_ignore=");
+      i32toa(aux + strlen(aux), dir_ignore);
       strcat(aux, comm_dim_partitioned_string());
       apply(device::get_default_stream());
     }
@@ -30,7 +35,11 @@ namespace quda {
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      launch<APE>(tp, stream, GaugeAPEArg<Float,nColor,recon, apeDim>(out, in, alpha));
+      if (apeDim == 3) {
+        launch<APE>(tp, stream, GaugeAPEArg<Float, nColor, recon, 3>(out, in, alpha, dir_ignore));
+      } else if (apeDim == 4) {
+        launch<APE>(tp, stream, GaugeAPEArg<Float, nColor, recon, 4>(out, in, alpha, dir_ignore));
+      }
     }
 
     void preTune() { out.backup(); } // defensive measure in case they alias
@@ -50,16 +59,18 @@ namespace quda {
 
   }; // GaugeAPE
 
-  void APEStep(GaugeField &out, GaugeField& in, double alpha)
+  void APEStep(GaugeField &out, GaugeField &in, double alpha, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
     checkNative(out, in);
 
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    instantiate<GaugeAPE>(out, in, alpha);
+    instantiate<GaugeAPE>(out, in, alpha, dir_ignore);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu
new file mode 100644
index 0000000000..4b2c1e3a65
--- /dev/null
+++ b/lib/gauge_hyp.cu
@@ -0,0 +1,148 @@
+#include <quda_internal.h>
+#include <gauge_field.h>
+#include <tunable_nd.h>
+#include <instantiate.h>
+#include <kernels/gauge_hyp.cuh>
+
+namespace quda
+{
+
+  template <typename Float, int nColor, QudaReconstructType recon> class GaugeHYP : TunableKernel3D
+  {
+    GaugeField &out;
+    GaugeField *tmp[4];
+    const GaugeField &in;
+    const Float alpha;
+    const int level;
+    const int dir_ignore;
+    const int hypDim;
+    unsigned int minThreads() const { return in.LocalVolumeCB(); }
+    unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array
+
+  public:
+    // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) :
+      TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
+      out(out),
+      tmp {tmp[0], tmp[1], tmp[2], tmp[3]},
+      in(in),
+      alpha(static_cast<Float>(alpha)),
+      level(level),
+      dir_ignore(dir_ignore),
+      hypDim((dir_ignore == 4) ? 4 : 3)
+    {
+      strcat(aux, ",level=");
+      i32toa(aux + strlen(aux), level);
+      strcat(aux, ",dir_ignore=");
+      i32toa(aux + strlen(aux), dir_ignore);
+      strcat(aux, comm_dim_partitioned_string());
+      apply(device::get_default_stream());
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      if (hypDim == 4) {
+        if (level == 1) {
+          launch<HYP>(tp, stream, GaugeHYPArg<Float, nColor, recon, 1, 4>(out, tmp, in, alpha, dir_ignore));
+        } else if (level == 2) {
+          launch<HYP>(tp, stream, GaugeHYPArg<Float, nColor, recon, 2, 4>(out, tmp, in, alpha, dir_ignore));
+        } else if (level == 3) {
+          launch<HYP>(tp, stream, GaugeHYPArg<Float, nColor, recon, 3, 4>(out, tmp, in, alpha, dir_ignore));
+        }
+      } else if (hypDim == 3) {
+        if (level == 1) {
+          launch<HYP3D>(tp, stream, GaugeHYPArg<Float, nColor, recon, 1, 3>(out, tmp, in, alpha, dir_ignore));
+        } else if (level == 2) {
+          launch<HYP3D>(tp, stream, GaugeHYPArg<Float, nColor, recon, 2, 3>(out, tmp, in, alpha, dir_ignore));
+        }
+      }
+    }
+
+    void preTune() { out.backup(); } // defensive measure in case they alias
+    void postTune() { out.restore(); }
+
+    long long flops() const
+    {
+      long long flops = 0;
+      auto mat_flops = in.Ncolor() * in.Ncolor() * (8ll * in.Ncolor() - 2ll);
+      if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) {
+        flops += ((hypDim - 1) * 2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
+      } else if (hypDim == 4 && level == 2) {
+        flops += ((hypDim - 1) * 2 + (hypDim - 1) * (hypDim - 2) * 4) * mat_flops * hypDim * in.LocalVolume();
+      } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) {
+        flops += (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume();
+      }
+      return flops;
+    }
+
+    long long bytes() const
+    {
+      long long bytes = 0;
+      if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { // 6 links per dim, 1 in, hypDim-1 tmp
+        bytes += (in.Reconstruct() * in.Precision() + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision()
+                  + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
+          * hypDim * in.LocalVolume();
+      } else if (hypDim == 4 && level == 2) { // 6 links per dim, 1 in, hypDim-1 tmp
+        bytes += (in.Reconstruct() * in.Precision()
+                  + (hypDim - 1) * (hypDim - 2) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
+                  + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision())
+          * hypDim * in.LocalVolume();
+      } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { // 6 links per dim, 1 in, 1 out
+        bytes += (in.Reconstruct() * in.Precision() + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision()
+                  + out.Reconstruct() * out.Precision())
+          * hypDim * in.LocalVolume();
+      }
+      return bytes;
+    }
+
+  }; // GaugeAPE
+
+  void HYPStep(GaugeField &out, GaugeField &in, double alpha1, double alpha2, double alpha3, int dir_ignore)
+  {
+    checkPrecision(out, in);
+    checkReconstruct(out, in);
+    checkNative(out, in);
+
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
+    GaugeFieldParam gParam(out);
+    gParam.location = QUDA_CUDA_FIELD_LOCATION;
+    gParam.geometry = QUDA_TENSOR_GEOMETRY;
+
+    GaugeField *tmp[4];
+    if (dir_ignore == 4) { // hypDim == 4
+      for (int i = 0; i < 4; ++i) { tmp[i] = new GaugeField(gParam); }
+    } else { // hypDim == 3
+      tmp[0] = new GaugeField(gParam);
+      // tmp[1], tmp[2] and tmp[3] will not be used for hypDim == 3
+      gParam.create = QUDA_REFERENCE_FIELD_CREATE;
+      for (int i = 1; i < 4; ++i) { tmp[i] = new GaugeField(gParam); }
+    }
+
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+    if (dir_ignore == 4) {
+      copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
+      in.exchangeExtendedGhost(in.R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha3, 1, dir_ignore);
+      tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false);
+      tmp[1]->exchangeExtendedGhost(tmp[1]->R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
+      tmp[2]->exchangeExtendedGhost(tmp[2]->R(), false);
+      tmp[3]->exchangeExtendedGhost(tmp[3]->R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha1, 3, dir_ignore);
+      out.exchangeExtendedGhost(out.R(), false);
+    } else {
+      copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
+      in.exchangeExtendedGhost(in.R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha3, 1, dir_ignore);
+      tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false);
+      instantiate<GaugeHYP>(out, tmp, in, alpha2, 2, dir_ignore);
+      out.exchangeExtendedGhost(out.R(), false);
+    }
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+
+    for (int i = 0; i < 4; i++) delete tmp[i];
+  }
+
+} // namespace quda
\ No newline at end of file
diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp
index 041dc6164d..37f8061686 100644
--- a/lib/gauge_observable.cpp
+++ b/lib/gauge_observable.cpp
@@ -12,7 +12,7 @@ namespace quda
       int *num_failures_h = static_cast<int *>(pool_pinned_malloc(sizeof(int)));
       int *num_failures_d = static_cast<int *>(get_mapped_device_pointer(num_failures_h));
       *num_failures_h = 0;
-      auto tol = u.Precision() == QUDA_DOUBLE_PRECISION ? 1e-14 : 1e-6;
+      auto tol = u.toleranceSU3();
       projectSU3(u, tol, num_failures_d);
       if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
       pool_pinned_free(num_failures_h);
diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu
index 48d7e638e8..b3433784ce 100644
--- a/lib/gauge_stout.cu
+++ b/lib/gauge_stout.cu
@@ -12,6 +12,7 @@ namespace quda {
     const bool improved;
     const Float rho;
     const Float epsilon;
+    const int dir_ignore;
     const int stoutDim;
     unsigned int minThreads() const { return in.LocalVolumeCB(); }
 
@@ -24,17 +25,20 @@ namespace quda {
     }
 
   public:
-    // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim
-    GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon = 0.0) :
-      TunableKernel3D(in, 2, improved ? 4 : 3),
+    // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim
+    GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon, int dir_ignore) :
+      TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3),
       out(out),
       in(in),
       improved(improved),
       rho(static_cast<Float>(rho)),
       epsilon(static_cast<Float>(epsilon)),
-      stoutDim(improved ? 4 : 3)
+      dir_ignore(dir_ignore),
+      stoutDim((dir_ignore == 4) ? 4 : 3)
     {
       if (improved) strcat(aux, ",improved");
+      strcat(aux, ",dir_ignore=");
+      i32toa(aux + strlen(aux), dir_ignore);
       strcat(aux, comm_dim_partitioned_string());
       apply(device::get_default_stream());
     }
@@ -43,10 +47,18 @@ namespace quda {
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       if (!improved) {
-        launch<STOUT>(tp, stream, STOUTArg<Float, nColor, recon, 3>(out, in, rho));
+        if (stoutDim == 3) {
+          launch<STOUT>(tp, stream, STOUTArg<Float, nColor, recon, 3>(out, in, rho, 0.0, dir_ignore));
+        } else if (stoutDim == 4) {
+          launch<STOUT>(tp, stream, STOUTArg<Float, nColor, recon, 4>(out, in, rho, 0.0, dir_ignore));
+        }
       } else if (improved) {
         tp.set_max_shared_bytes = true;
-        launch<OvrImpSTOUT>(tp, stream, STOUTArg<Float, nColor, recon, 4>(out, in, rho, epsilon));
+        if (stoutDim == 3) {
+          launch<OvrImpSTOUT>(tp, stream, STOUTArg<Float, nColor, recon, 3>(out, in, rho, epsilon, dir_ignore));
+        } else if (stoutDim == 4) {
+          launch<OvrImpSTOUT>(tp, stream, STOUTArg<Float, nColor, recon, 4>(out, in, rho, epsilon, dir_ignore));
+        }
       }
     }
 
@@ -71,30 +83,34 @@ namespace quda {
               out.Reconstruct() * out.Precision()) * stoutDim * in.LocalVolume();    }
   };
 
-  void STOUTStep(GaugeField &out, GaugeField &in, double rho)
+  void STOUTStep(GaugeField &out, GaugeField &in, double rho, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
     checkNative(out, in);
 
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    instantiate<GaugeSTOUT>(out, in, false, rho);
+    instantiate<GaugeSTOUT>(out, in, false, rho, 0.0, dir_ignore);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
 
-  void OvrImpSTOUTStep(GaugeField &out, GaugeField& in, double rho, double epsilon)
+  void OvrImpSTOUTStep(GaugeField &out, GaugeField &in, double rho, double epsilon, int dir_ignore)
   {
     checkPrecision(out, in);
     checkReconstruct(out, in);
     checkNative(out, in);
 
+    if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; }
+
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
-    instantiate<GaugeSTOUT>(out, in, true, rho, epsilon);
+    instantiate<GaugeSTOUT>(out, in, true, rho, epsilon, dir_ignore);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 3313db26fe..f8c2fcda74 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3744,7 +3744,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
     // project onto SU(3) if using the Chroma convention
     if (param->staggered_phase_type == QUDA_STAGGERED_PHASE_CHROMA) {
       *num_failures_h = 0;
-      const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 1e-15 : 2e-6;
+      const double tol = unitarizedLink.toleranceSU3();
       if (unitarizedLink.StaggeredPhaseApplied()) unitarizedLink.removeStaggeredPhase();
       projectSU3(unitarizedLink, tol, num_failures_d);
       if (!unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied)
@@ -5086,12 +5086,22 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
   gaugeObservablesQuda(&obs_param[measurement_n]);
   logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge);
 
+  // set default dir_ignore = 3 for APE and STOUT for compatibility
+  int dir_ignore = smear_param->dir_ignore;
+  if (dir_ignore < 0
+      && (smear_param->smear_type == QUDA_GAUGE_SMEAR_APE || smear_param->smear_type == QUDA_GAUGE_SMEAR_STOUT)) {
+    dir_ignore = 3;
+  }
+
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
     switch (smear_param->smear_type) {
-    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha); break;
-    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho); break;
+    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha, dir_ignore); break;
+    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho, dir_ignore); break;
     case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
-      OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon);
+      OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, dir_ignore);
+      break;
+    case QUDA_GAUGE_SMEAR_HYP:
+      HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, dir_ignore);
       break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
     }
diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp
index a03726f3fb..265fed1248 100644
--- a/tests/su3_test.cpp
+++ b/tests/su3_test.cpp
@@ -22,8 +22,12 @@
 double gauge_smear_rho = 0.1;
 double gauge_smear_epsilon = 0.1;
 double gauge_smear_alpha = 0.6;
+double gauge_smear_alpha1 = 0.75;
+double gauge_smear_alpha2 = 0.6;
+double gauge_smear_alpha3 = 0.3;
 int gauge_smear_steps = 50;
 QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT;
+int gauge_smear_dir_ignore = -1;
 int measurement_interval = 5;
 bool su_project = true;
 
@@ -45,11 +49,17 @@ void display_test_info()
     printfQuda(" - rho %f\n", gauge_smear_rho);
     printfQuda(" - epsilon %f\n", gauge_smear_epsilon);
     break;
+  case QUDA_GAUGE_SMEAR_HYP:
+    printfQuda(" - alpha1 %f\n", gauge_smear_alpha1);
+    printfQuda(" - alpha2 %f\n", gauge_smear_alpha2);
+    printfQuda(" - alpha3 %f\n", gauge_smear_alpha3);
+    break;
   case QUDA_GAUGE_SMEAR_WILSON_FLOW:
   case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: printfQuda(" - epsilon %f\n", gauge_smear_epsilon); break;
   default: errorQuda("Undefined test type %d given", test_type);
   }
   printfQuda(" - smearing steps %d\n", gauge_smear_steps);
+  printfQuda(" - smearing ignore direction %d\n", gauge_smear_dir_ignore);
   printfQuda(" - Measurement interval %d\n", measurement_interval);
 
   printfQuda("Grid partition info:     X  Y  Z  T\n");
@@ -63,6 +73,7 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
   CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE},
                                                                 {"stout", QUDA_GAUGE_SMEAR_STOUT},
                                                                 {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT},
+                                                                {"hyp", QUDA_GAUGE_SMEAR_HYP},
                                                                 {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW},
                                                                 {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}};
 
@@ -72,7 +83,7 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup
     ->add_option(
       "--su3-smear-type",
-      gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, Wilson Flow, Symanzik Flow (default stout)")
+      gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)")
     ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map));
   ;
   opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)");
@@ -83,6 +94,14 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup->add_option("--su3-smear-epsilon", gauge_smear_epsilon,
                       "epsilon coefficient for Over-Improved Stout smearing or Wilson flow (default 0.1)");
 
+  opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)");
+  opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)");
+  opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)");
+
+  opgroup->add_option(
+    "--su3-smear-dir-ignore", gauge_smear_dir_ignore,
+    "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)");
+
   opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)");
 
   opgroup->add_option("--su3-measurement-interval", measurement_interval,
@@ -251,12 +270,17 @@ int main(int argc, char **argv)
   smear_param.alpha = gauge_smear_alpha;
   smear_param.rho = gauge_smear_rho;
   smear_param.epsilon = gauge_smear_epsilon;
+  smear_param.alpha1 = gauge_smear_alpha1;
+  smear_param.alpha2 = gauge_smear_alpha2;
+  smear_param.alpha3 = gauge_smear_alpha3;
+  smear_param.dir_ignore = gauge_smear_dir_ignore;
 
   host_timer.start(); // start the timer
   switch (smear_param.smear_type) {
   case QUDA_GAUGE_SMEAR_APE:
   case QUDA_GAUGE_SMEAR_STOUT:
-  case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: {
+  case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
+  case QUDA_GAUGE_SMEAR_HYP: {
     performGaugeSmearQuda(&smear_param, obs_param);
     break;
   }
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index f2b8f54bcc..196777d574 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -172,6 +172,7 @@ const char *get_gauge_smear_str(QudaGaugeSmearType type)
   case QUDA_GAUGE_SMEAR_APE: ret = "APE"; break;
   case QUDA_GAUGE_SMEAR_STOUT: ret = "Stout"; break;
   case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: ret = "Over Improved"; break;
+  case QUDA_GAUGE_SMEAR_HYP: ret = "HYP"; break;
   case QUDA_GAUGE_SMEAR_WILSON_FLOW: ret = "Wilson Flow"; break;
   case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: ret = "Symanzik Flow"; break;
   default: ret = "unknown"; break;