diff --git a/include/enum_quda.h b/include/enum_quda.h index c4cbb59901..60c186eb77 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -571,6 +571,7 @@ typedef enum QudaGaugeSmearType_s { QUDA_GAUGE_SMEAR_APE, QUDA_GAUGE_SMEAR_STOUT, QUDA_GAUGE_SMEAR_OVRIMP_STOUT, + QUDA_GAUGE_SMEAR_HYP, QUDA_GAUGE_SMEAR_WILSON_FLOW, QUDA_GAUGE_SMEAR_SYMANZIK_FLOW, QUDA_GAUGE_SMEAR_INVALID = QUDA_INVALID_ENUM diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 6a8708948a..ada17e2cc8 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -504,8 +504,9 @@ #define QUDA_GAUGE_SMEAR_APE 0 #define QUDA_GAUGE_SMEAR_STOUT 1 #define QUDA_GAUGE_SMEAR_OVRIMP_STOUT 2 -#define QUDA_GAUGE_SMEAR_WILSON_FLOW 3 -#define QUDA_GAUGE_SMEAR_SYMANZIK_FLOW 4 +#define QUDA_GAUGE_SMEAR_HYP 3 +#define QUDA_GAUGE_SMEAR_WILSON_FLOW 4 +#define QUDA_GAUGE_SMEAR_SYMANZIK_FLOW 5 #define QUDA_GAUGE_SMEAR_INVALID QUDA_INVALID_ENUM #define QudaFermionSmearType integer(4) diff --git a/include/gauge_field.h b/include/gauge_field.h index ffea94c22b..a62b6a684f 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -638,6 +638,19 @@ namespace quda { */ static bool are_compatible_weak(const GaugeField &a, const GaugeField &b); + /** + * @brief Helper function that returns the default tolerance used by SU(3) projection + * @return The default tolerance, which is ~10x epsilon + */ + double toleranceSU3() const + { + switch (precision) { + case QUDA_DOUBLE_PRECISION: return 2e-15; + case QUDA_SINGLE_PRECISION: return 1e-6; + default: return 1e-6; + } + } + friend struct GaugeFieldParam; }; diff --git a/include/gauge_tools.h b/include/gauge_tools.h index 9b7d68db37..f6bc3f22b9 100644 --- a/include/gauge_tools.h +++ b/include/gauge_tools.h @@ -95,8 +95,9 @@ namespace quda @param[out] dataDs Output smeared field @param[in] dataOr Input gauge field @param[in] alpha smearing parameter + @param[in] dir_ignore ignored direction */ - void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha); + void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha, int dir_ignore); /** @brief Apply STOUT smearing to the gauge field @@ -104,8 +105,9 @@ namespace quda @param[out] dataDs Output smeared field @param[in] dataOr Input gauge field @param[in] rho smearing parameter + @param[in] dir_ignore ignored direction */ - void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho); + void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, int dir_ignore); /** @brief Apply Over Improved STOUT smearing to the gauge field @@ -114,8 +116,20 @@ namespace quda @param[in] dataOr Input gauge field @param[in] rho smearing parameter @param[in] epsilon smearing parameter + @param[in] dir_ignore ignored direction */ - void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon); + void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon, int dir_ignore); + + /** + @brief Apply HYP smearing to the gauge field + @param[out] dataDs Output smeared field + @param[in] dataOr Input gauge field + @param[in] alpha1 smearing parameter + @param[in] alpha2 smearing parameter + @param[in] alpha3 smearing parameter + @param[in] dir_ignore ignored direction + */ + void HYPStep(GaugeField &dataDs, GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore); /** @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field. diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh index 3bf490dfea..f8e00d03db 100644 --- a/include/kernels/gauge_ape.cuh +++ b/include/kernels/gauge_ape.cuh @@ -8,9 +8,6 @@ namespace quda { -#define DOUBLE_TOL 1e-15 -#define SINGLE_TOL 2e-6 - template struct GaugeAPEArg : kernel_param<> { using Float = Float_; @@ -26,14 +23,16 @@ namespace quda int X[4]; // grid dimensions int border[4]; const Float alpha; + const int dir_ignore; const Float tolerance; - GaugeAPEArg(GaugeField &out, const GaugeField &in, double alpha) : + GaugeAPEArg(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) : kernel_param(dim3(in.LocalVolumeCB(), 2, apeDim)), out(out), in(in), alpha(alpha), - tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL) + dir_ignore(dir_ignore), + tolerance(in.toleranceSU3()) { for (int dir = 0; dir < 4; ++dir) { border[dir] = in.R()[dir]; @@ -61,16 +60,17 @@ namespace quda x[dr] += arg.border[dr]; X[dr] += 2 * arg.border[dr]; } + dir = dir + (dir >= arg.dir_ignore); int dx[4] = {0, 0, 0, 0}; Link U, Stap, TestU, I; // This function gets stap = S_{mu,nu} i.e., the staple of length 3, - computeStaple(arg, x, X, parity, dir, Stap, Arg::apeDim); + computeStaple(arg, x, X, parity, dir, Stap, arg.dir_ignore); // Get link U U = arg.in(dir, linkIndexShift(x, dx, X), parity); - - Stap = Stap * (arg.alpha / ((real)(2. * (3. - 1.)))); + + Stap = Stap * (arg.alpha / ((real)(2 * (Arg::apeDim - 1)))); setIdentity(&I); TestU = I * (static_cast(1.0) - arg.alpha) + Stap * conj(U); diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh new file mode 100644 index 0000000000..c444f86169 --- /dev/null +++ b/include/kernels/gauge_hyp.cuh @@ -0,0 +1,347 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + template + struct GaugeHYPArg : kernel_param<> { + using Float = Float_; + static constexpr int nColor = nColor_; + static_assert(nColor == 3, "Only nColor=3 enabled at this time"); + static constexpr QudaReconstructType recon = recon_; + static constexpr int hypDim = hypDim_; + static constexpr int level = level_; + typedef typename gauge_mapper::type Gauge; + + Gauge out; + Gauge tmp[4]; + const Gauge in; + + int_fastdiv E[4]; // extended grid dimensions + int_fastdiv X[4]; // grid dimensions + int border[4]; + const Float alpha; + const int dir_ignore; + const Float tolerance; + + GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) : + kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)), + out(out), + tmp {*tmp[0], *tmp[1], *tmp[2], *tmp[3]}, + in(in), + alpha(alpha), + dir_ignore(dir_ignore), + tolerance(in.toleranceSU3()) + { + for (int dir = 0; dir < 4; ++dir) { + border[dir] = in.R()[dir]; + E[dir] = in.X()[dir]; + X[dir] = in.X()[dir] - border[dir] * 2; + } + } + }; + + /** + @brief Calculates a staple as part of the HYP calculation, returning the product + + @param[in] arg Kernel argument + @param[in] gauge_mu Gauge/tensor field used for the parallel direction + @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction + @param[in] x Full index array + @param[in,out] dx Full index offset array; leaves the function with the same values it entered with + @param[in] parity Parity index + @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields + @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets + @return The computed staple + */ + template + __host__ __device__ inline Matrix, Arg::nColor> + accumulateStaple(const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], + thread_array &dx, int parity, int2 tensor_arg, int2 shifts) + { + using Link = Matrix, Arg::nColor>; + Link staple; + + // for readability + const int mu = shifts.x; + const int nu = shifts.y; + + /* Computes the upper staple : + * mu (B) + * +-------+ + * nu | | + * (A) | |(C) + * X X + */ + { + /* load matrix A*/ + Link a = gauge_nu(tensor_arg.y, linkIndex(x, arg.E), parity); + + /* load matrix B*/ + dx[nu]++; + Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1 - parity); + dx[nu]--; + + /* load matrix C*/ + dx[mu]++; + Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1 - parity); + dx[mu]--; + + staple = a * b * conj(c); + } + + /* Computes the lower staple : + * X X + * nu | | + * (A) | | (C) + * +-------+ + * mu (B) + */ + { + /* load matrix A*/ + dx[nu]--; + Link a = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1 - parity); + + /* load matrix B*/ + Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1 - parity); + + /* load matrix C*/ + dx[mu]++; + Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), parity); + dx[mu]--; + dx[nu]++; + + staple = staple + conj(a) * b * c; + } + + return staple; + } + + template + __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], thread_array &dx, int parity, + int mu, Matrix, Arg::nColor> staple[3]) + { + using Link = Matrix, Arg::nColor>; + for (int i = 0; i < 3; ++i) staple[i] = Link(); + + int cnt = 0; +#pragma unroll + for (int nu = 0; nu < 4; ++nu) { + if (nu != mu) { + Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu}); + switch (cnt) { + case 0: staple[0] = staple[0] + accum; break; + case 1: staple[1] = staple[1] + accum; break; + case 2: staple[2] = staple[2] + accum; break; + } + cnt++; + } + } + } + + template + __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], thread_array &dx, int parity, + int mu, Matrix, Arg::nColor> staple[3]) + { + using Link = Matrix, Arg::nColor>; + for (int i = 0; i < 3; ++i) staple[i] = Link(); + + int cnt = 0; +#pragma unroll + for (int nu = 0; nu < 4; nu++) { + if (nu != mu) { +#pragma unroll + for (int rho = 0; rho < 4; ++rho) { + if (rho != mu && rho != nu) { + int sigma = 0; + while (sigma == mu || sigma == nu || sigma == rho) sigma += 1; + + const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); + const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); + + Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, + int2 {sigma_with_mu, sigma_with_rho}, {mu, rho}); + switch (cnt) { + case 0: staple[0] = staple[0] + accum; break; + case 1: staple[1] = staple[1] + accum; break; + case 2: staple[2] = staple[2] + accum; break; + } + } + } + + cnt++; + } + } + } + + template + __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], thread_array &dx, int parity, + int mu, Matrix, Arg::nColor> &staple) + { +#pragma unroll + for (int nu = 0; nu < 4; nu++) { + if (nu != mu) { + const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu); + const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); + + staple = staple + + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, + int2 {nu_with_mu, mu_with_nu}, {mu, nu}); + } + } + } + + template struct HYP { + const Arg &arg; + constexpr HYP(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity, int dir) + { + using real = typename Arg::Float; + using Link = Matrix, Arg::nColor>; + + // compute spacetime and local coords + int x[4]; + getCoords(x, x_cb, arg.X, parity); +#pragma unroll + for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates + + thread_array dx = {0, 0, 0, 0}; + + Link U, Stap[3], TestU, I; + + // Get link U + U = arg.in(dir, linkIndexShift(x, dx, arg.E), parity); + setIdentity(&I); + + if constexpr (Arg::level == 1) { + computeStapleLevel1(arg, x, dx, parity, dir, Stap); + +#pragma unroll + for (int i = 0; i < 3; ++i) { + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.)); + polarSu3(TestU, arg.tolerance); + arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; + } + } else if constexpr (Arg::level == 2) { + computeStapleLevel2(arg, x, dx, parity, dir, Stap); + +#pragma unroll + for (int i = 0; i < 3; ++i) { + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)4.)); + polarSu3(TestU, arg.tolerance); + arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; + } + } else if constexpr (Arg::level == 3) { + computeStapleLevel3(arg, x, dx, parity, dir, Stap[0]); + + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.)); + polarSu3(TestU, arg.tolerance); + arg.out(dir, linkIndexShift(x, dx, arg.E), parity) = TestU * U; + } + } + }; + + template + __host__ __device__ inline void + computeStaple3DLevel1(const Arg &arg, int x[], thread_array &dx, int parity, int mu, + Matrix, Arg::nColor> staple[2], const int dir_ignore) + { + using Link = Matrix, Arg::nColor>; + for (int i = 0; i < 2; ++i) staple[i] = Link(); + + int cnt = 0; +#pragma unroll + for (int nu = 0; nu < 4; ++nu) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim) + if (nu != mu && nu != dir_ignore) { + Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu}); + switch (cnt) { + case 0: staple[0] = staple[0] + accum; break; + case 1: staple[1] = staple[1] + accum; break; + } + cnt++; + } + } + } + + template + __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], thread_array &dx, int parity, + int mu, Matrix, Arg::nColor> &staple, + int dir_ignore) + { + using Link = Matrix, Arg::nColor>; + staple = Link(); + +#pragma unroll + for (int nu = 0; nu < 4; nu++) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim) + if (nu != mu && nu != dir_ignore) { +#pragma unroll + for (int rho = 0; rho < 4; ++rho) { + if (rho != mu && rho != nu && rho != dir_ignore) { + const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore); + const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore); + + staple = staple + + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, int2 {rho_with_mu, rho_with_nu}, {mu, nu}); + } + } + } + } + } + + template struct HYP3D { + const Arg &arg; + constexpr HYP3D(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity, int dir) + { + using real = typename Arg::Float; + typedef Matrix, Arg::nColor> Link; + + // compute spacetime and local coords + int x[4]; + getCoords(x, x_cb, arg.X, parity); +#pragma unroll + for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates + + thread_array dx = {0, 0, 0, 0}; + + int dir_ = dir; + dir = dir + (dir >= arg.dir_ignore); + Link U, Stap[2], TestU, I; + + // Get link U + U = arg.in(dir, linkIndexShift(x, dx, arg.E), parity); + setIdentity(&I); + + if constexpr (Arg::level == 1) { + computeStaple3DLevel1(arg, x, dx, parity, dir, Stap, arg.dir_ignore); + +#pragma unroll + for (int i = 0; i < 2; ++i) { + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.)); + polarSu3(TestU, arg.tolerance); + arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; + } + } else if constexpr (Arg::level == 2) { + computeStaple3DLevel2(arg, x, dx, parity, dir, Stap[0], arg.dir_ignore); + + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.)); + polarSu3(TestU, arg.tolerance); + arg.out(dir, linkIndexShift(x, dx, arg.E), parity) = TestU * U; + } + } + }; +} // namespace quda \ No newline at end of file diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 2a512e77e3..f9072a4ee8 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -28,14 +28,16 @@ namespace quda const Float rho; const Float staple_coeff; const Float rectangle_coeff; + const int dir_ignore; - STOUTArg(GaugeField &out, const GaugeField &in, Float rho, Float epsilon = 0) : + STOUTArg(GaugeField &out, const GaugeField &in, Float rho, Float epsilon, int dir_ignore) : kernel_param(dim3(1, 2, stoutDim)), out(out), in(in), rho(rho), staple_coeff(rho * (5.0 - 2.0 * epsilon) / 3.0), - rectangle_coeff(rho * (1.0 - epsilon) / 12.0) + rectangle_coeff(rho * (1.0 - epsilon) / 12.0), + dir_ignore(dir_ignore) { for (int dir = 0; dir < 4; ++dir) { border[dir] = in.R()[dir]; @@ -67,11 +69,12 @@ namespace quda x[dr] += arg.border[dr]; X[dr] += 2 * arg.border[dr]; } + dir = dir + (dir >= arg.dir_ignore); Link U, Stap, Q; // This function gets stap = S_{mu,nu} i.e., the staple of length 3, - computeStaple(arg, x, X, parity, dir, Stap, Arg::stoutDim); + computeStaple(arg, x, X, parity, dir, Stap, arg.dir_ignore); // Get link U U = arg.in(dir, linkIndex(x, X), parity); @@ -133,6 +136,7 @@ namespace quda x[dr] += arg.border[dr]; X[dr] += 2 * arg.border[dr]; } + dir = dir + (dir >= arg.dir_ignore); Link U, Q; ThreadLocalCache Stap; @@ -141,7 +145,7 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: // https://arxiv.org/abs/0801.1165 - computeStapleRectangle(arg, x, X, parity, dir, Stap, Rect, Arg::stoutDim); + computeStapleRectangle(arg, x, X, parity, dir, Stap, Rect, arg.dir_ignore); // Get link U U = arg.in(dir, linkIndex(x, X), parity); diff --git a/include/quda.h b/include/quda.h index ff49a2940b..de88d1d0d2 100644 --- a/include/quda.h +++ b/include/quda.h @@ -846,10 +846,15 @@ extern "C" { Wilson/Symanzik flow */ double alpha; /**< The single coefficient used in APE smearing */ double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ + double alpha1; /**< The coefficient used in HYP smearing step 3 (will not be used in 3D smearing)*/ + double alpha2; /**< The coefficient used in HYP smearing step 2*/ + double alpha3; /**< The coefficient used in HYP smearing step 1*/ unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ QudaGaugeSmearType smear_type; /**< The smearing type to perform */ - QudaBoolean restart; /**< Used to restart the smearing from existing gaugeSmeared */ - double t0; /**< Starting flow time for Wilson flow */ + QudaBoolean restart; /**< Used to restart the smearing from existing gaugeSmeared */ + double t0; /**< Starting flow time for Wilson flow */ + int dir_ignore; /**< The direction to be ignored by the smearing algorithm + A negative value means 3D for APE/STOUT and 4D for OVRIMP_STOUT/HYP */ } QudaGaugeSmearParam; typedef struct QudaBLASParam_s { diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index d8c1d8342b..e9b7945ea7 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -27,7 +27,7 @@ set (QUDA_OBJS gauge_phase.cu timer.cpp solver.cpp inv_bicgstab_quda.cpp inv_cg_quda.cpp inv_bicgstabl_quda.cpp inv_multi_cg_quda.cpp inv_eigcg_quda.cpp gauge_ape.cu - gauge_stout.cu gauge_wilson_flow.cu gauge_plaq.cu + gauge_stout.cu gauge_hyp.cu gauge_wilson_flow.cu gauge_plaq.cu gauge_laplace.cpp gauge_observable.cpp inv_cg3_quda.cpp inv_ca_gcr.cpp inv_ca_cg.cpp inv_gcr_quda.cpp inv_mr_quda.cpp inv_sd_quda.cpp diff --git a/lib/check_params.h b/lib/check_params.h index bc88bc8b26..86800b0e7e 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -1102,6 +1102,10 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param) P(epsilon, 0.0); P(restart, QUDA_BOOLEAN_FALSE); P(t0, 0.0); + P(alpha1, 0.0); + P(alpha2, 0.0); + P(alpha3, 0.0); + P(dir_ignore, -1); #else P(n_steps, (unsigned int)INVALID_INT); P(meas_interval, (unsigned int)INVALID_INT); @@ -1110,6 +1114,10 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param) P(epsilon, INVALID_DOUBLE); P(restart, QUDA_BOOLEAN_INVALID); P(t0, INVALID_DOUBLE); + P(alpha1, INVALID_DOUBLE); + P(alpha2, INVALID_DOUBLE); + P(alpha3, INVALID_DOUBLE); + P(dir_ignore, INVALID_INT); #endif #ifdef INIT_PARAM diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu index a2c8a92dd8..2174e3f9b2 100644 --- a/lib/gauge_ape.cu +++ b/lib/gauge_ape.cu @@ -8,21 +8,26 @@ namespace quda { template class GaugeAPE : TunableKernel3D { - static constexpr int apeDim = 3; // apply APE in space only GaugeField &out; const GaugeField ∈ const Float alpha; + const int dir_ignore; + const int apeDim; unsigned int minThreads() const { return in.LocalVolumeCB(); } unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: - // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim - GaugeAPE(GaugeField &out, const GaugeField &in, double alpha) : - TunableKernel3D(in, 2, apeDim), + // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + GaugeAPE(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) : + TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), in(in), - alpha(static_cast(alpha)) + alpha(static_cast(alpha)), + dir_ignore(dir_ignore), + apeDim((dir_ignore == 4) ? 4 : 3) { + strcat(aux, ",dir_ignore="); + i32toa(aux + strlen(aux), dir_ignore); strcat(aux, comm_dim_partitioned_string()); apply(device::get_default_stream()); } @@ -30,7 +35,11 @@ namespace quda { void apply(const qudaStream_t &stream) { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - launch(tp, stream, GaugeAPEArg(out, in, alpha)); + if (apeDim == 3) { + launch(tp, stream, GaugeAPEArg(out, in, alpha, dir_ignore)); + } else if (apeDim == 4) { + launch(tp, stream, GaugeAPEArg(out, in, alpha, dir_ignore)); + } } void preTune() { out.backup(); } // defensive measure in case they alias @@ -50,16 +59,18 @@ namespace quda { }; // GaugeAPE - void APEStep(GaugeField &out, GaugeField& in, double alpha) + void APEStep(GaugeField &out, GaugeField &in, double alpha, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); checkNative(out, in); + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - instantiate(out, in, alpha); + instantiate(out, in, alpha, dir_ignore); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu new file mode 100644 index 0000000000..4b2c1e3a65 --- /dev/null +++ b/lib/gauge_hyp.cu @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include + +namespace quda +{ + + template class GaugeHYP : TunableKernel3D + { + GaugeField &out; + GaugeField *tmp[4]; + const GaugeField ∈ + const Float alpha; + const int level; + const int dir_ignore; + const int hypDim; + unsigned int minThreads() const { return in.LocalVolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array + + public: + // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) : + TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), + out(out), + tmp {tmp[0], tmp[1], tmp[2], tmp[3]}, + in(in), + alpha(static_cast(alpha)), + level(level), + dir_ignore(dir_ignore), + hypDim((dir_ignore == 4) ? 4 : 3) + { + strcat(aux, ",level="); + i32toa(aux + strlen(aux), level); + strcat(aux, ",dir_ignore="); + i32toa(aux + strlen(aux), dir_ignore); + strcat(aux, comm_dim_partitioned_string()); + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + if (hypDim == 4) { + if (level == 1) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } else if (level == 2) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } else if (level == 3) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } + } else if (hypDim == 3) { + if (level == 1) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } else if (level == 2) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } + } + } + + void preTune() { out.backup(); } // defensive measure in case they alias + void postTune() { out.restore(); } + + long long flops() const + { + long long flops = 0; + auto mat_flops = in.Ncolor() * in.Ncolor() * (8ll * in.Ncolor() - 2ll); + if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { + flops += ((hypDim - 1) * 2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); + } else if (hypDim == 4 && level == 2) { + flops += ((hypDim - 1) * 2 + (hypDim - 1) * (hypDim - 2) * 4) * mat_flops * hypDim * in.LocalVolume(); + } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { + flops += (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); + } + return flops; + } + + long long bytes() const + { + long long bytes = 0; + if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { // 6 links per dim, 1 in, hypDim-1 tmp + bytes += (in.Reconstruct() * in.Precision() + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision() + + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) + * hypDim * in.LocalVolume(); + } else if (hypDim == 4 && level == 2) { // 6 links per dim, 1 in, hypDim-1 tmp + bytes += (in.Reconstruct() * in.Precision() + + (hypDim - 1) * (hypDim - 2) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() + + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) + * hypDim * in.LocalVolume(); + } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { // 6 links per dim, 1 in, 1 out + bytes += (in.Reconstruct() * in.Precision() + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() + + out.Reconstruct() * out.Precision()) + * hypDim * in.LocalVolume(); + } + return bytes; + } + + }; // GaugeAPE + + void HYPStep(GaugeField &out, GaugeField &in, double alpha1, double alpha2, double alpha3, int dir_ignore) + { + checkPrecision(out, in); + checkReconstruct(out, in); + checkNative(out, in); + + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + + GaugeFieldParam gParam(out); + gParam.location = QUDA_CUDA_FIELD_LOCATION; + gParam.geometry = QUDA_TENSOR_GEOMETRY; + + GaugeField *tmp[4]; + if (dir_ignore == 4) { // hypDim == 4 + for (int i = 0; i < 4; ++i) { tmp[i] = new GaugeField(gParam); } + } else { // hypDim == 3 + tmp[0] = new GaugeField(gParam); + // tmp[1], tmp[2] and tmp[3] will not be used for hypDim == 3 + gParam.create = QUDA_REFERENCE_FIELD_CREATE; + for (int i = 1; i < 4; ++i) { tmp[i] = new GaugeField(gParam); } + } + + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + if (dir_ignore == 4) { + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); + in.exchangeExtendedGhost(in.R(), false); + instantiate(out, tmp, in, alpha3, 1, dir_ignore); + tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false); + tmp[1]->exchangeExtendedGhost(tmp[1]->R(), false); + instantiate(out, tmp, in, alpha2, 2, dir_ignore); + tmp[2]->exchangeExtendedGhost(tmp[2]->R(), false); + tmp[3]->exchangeExtendedGhost(tmp[3]->R(), false); + instantiate(out, tmp, in, alpha1, 3, dir_ignore); + out.exchangeExtendedGhost(out.R(), false); + } else { + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); + in.exchangeExtendedGhost(in.R(), false); + instantiate(out, tmp, in, alpha3, 1, dir_ignore); + tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false); + instantiate(out, tmp, in, alpha2, 2, dir_ignore); + out.exchangeExtendedGhost(out.R(), false); + } + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + + for (int i = 0; i < 4; i++) delete tmp[i]; + } + +} // namespace quda \ No newline at end of file diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp index 041dc6164d..37f8061686 100644 --- a/lib/gauge_observable.cpp +++ b/lib/gauge_observable.cpp @@ -12,7 +12,7 @@ namespace quda int *num_failures_h = static_cast(pool_pinned_malloc(sizeof(int))); int *num_failures_d = static_cast(get_mapped_device_pointer(num_failures_h)); *num_failures_h = 0; - auto tol = u.Precision() == QUDA_DOUBLE_PRECISION ? 1e-14 : 1e-6; + auto tol = u.toleranceSU3(); projectSU3(u, tol, num_failures_d); if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); pool_pinned_free(num_failures_h); diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index 48d7e638e8..b3433784ce 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -12,6 +12,7 @@ namespace quda { const bool improved; const Float rho; const Float epsilon; + const int dir_ignore; const int stoutDim; unsigned int minThreads() const { return in.LocalVolumeCB(); } @@ -24,17 +25,20 @@ namespace quda { } public: - // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim - GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon = 0.0) : - TunableKernel3D(in, 2, improved ? 4 : 3), + // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon, int dir_ignore) : + TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), in(in), improved(improved), rho(static_cast(rho)), epsilon(static_cast(epsilon)), - stoutDim(improved ? 4 : 3) + dir_ignore(dir_ignore), + stoutDim((dir_ignore == 4) ? 4 : 3) { if (improved) strcat(aux, ",improved"); + strcat(aux, ",dir_ignore="); + i32toa(aux + strlen(aux), dir_ignore); strcat(aux, comm_dim_partitioned_string()); apply(device::get_default_stream()); } @@ -43,10 +47,18 @@ namespace quda { { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); if (!improved) { - launch(tp, stream, STOUTArg(out, in, rho)); + if (stoutDim == 3) { + launch(tp, stream, STOUTArg(out, in, rho, 0.0, dir_ignore)); + } else if (stoutDim == 4) { + launch(tp, stream, STOUTArg(out, in, rho, 0.0, dir_ignore)); + } } else if (improved) { tp.set_max_shared_bytes = true; - launch(tp, stream, STOUTArg(out, in, rho, epsilon)); + if (stoutDim == 3) { + launch(tp, stream, STOUTArg(out, in, rho, epsilon, dir_ignore)); + } else if (stoutDim == 4) { + launch(tp, stream, STOUTArg(out, in, rho, epsilon, dir_ignore)); + } } } @@ -71,30 +83,34 @@ namespace quda { out.Reconstruct() * out.Precision()) * stoutDim * in.LocalVolume(); } }; - void STOUTStep(GaugeField &out, GaugeField &in, double rho) + void STOUTStep(GaugeField &out, GaugeField &in, double rho, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); checkNative(out, in); + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - instantiate(out, in, false, rho); + instantiate(out, in, false, rho, 0.0, dir_ignore); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } - void OvrImpSTOUTStep(GaugeField &out, GaugeField& in, double rho, double epsilon) + void OvrImpSTOUTStep(GaugeField &out, GaugeField &in, double rho, double epsilon, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); checkNative(out, in); + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - instantiate(out, in, true, rho, epsilon); + instantiate(out, in, true, rho, epsilon, dir_ignore); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 3313db26fe..f8c2fcda74 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3744,7 +3744,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, // project onto SU(3) if using the Chroma convention if (param->staggered_phase_type == QUDA_STAGGERED_PHASE_CHROMA) { *num_failures_h = 0; - const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 1e-15 : 2e-6; + const double tol = unitarizedLink.toleranceSU3(); if (unitarizedLink.StaggeredPhaseApplied()) unitarizedLink.removeStaggeredPhase(); projectSU3(unitarizedLink, tol, num_failures_d); if (!unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied) @@ -5086,12 +5086,22 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable gaugeObservablesQuda(&obs_param[measurement_n]); logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); + // set default dir_ignore = 3 for APE and STOUT for compatibility + int dir_ignore = smear_param->dir_ignore; + if (dir_ignore < 0 + && (smear_param->smear_type == QUDA_GAUGE_SMEAR_APE || smear_param->smear_type == QUDA_GAUGE_SMEAR_STOUT)) { + dir_ignore = 3; + } + for (unsigned int i = 0; i < smear_param->n_steps; i++) { switch (smear_param->smear_type) { - case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha); break; - case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho); break; + case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha, dir_ignore); break; + case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho, dir_ignore); break; case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: - OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon); + OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, dir_ignore); + break; + case QUDA_GAUGE_SMEAR_HYP: + HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, dir_ignore); break; default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); } diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp index a03726f3fb..265fed1248 100644 --- a/tests/su3_test.cpp +++ b/tests/su3_test.cpp @@ -22,8 +22,12 @@ double gauge_smear_rho = 0.1; double gauge_smear_epsilon = 0.1; double gauge_smear_alpha = 0.6; +double gauge_smear_alpha1 = 0.75; +double gauge_smear_alpha2 = 0.6; +double gauge_smear_alpha3 = 0.3; int gauge_smear_steps = 50; QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; +int gauge_smear_dir_ignore = -1; int measurement_interval = 5; bool su_project = true; @@ -45,11 +49,17 @@ void display_test_info() printfQuda(" - rho %f\n", gauge_smear_rho); printfQuda(" - epsilon %f\n", gauge_smear_epsilon); break; + case QUDA_GAUGE_SMEAR_HYP: + printfQuda(" - alpha1 %f\n", gauge_smear_alpha1); + printfQuda(" - alpha2 %f\n", gauge_smear_alpha2); + printfQuda(" - alpha3 %f\n", gauge_smear_alpha3); + break; case QUDA_GAUGE_SMEAR_WILSON_FLOW: case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: printfQuda(" - epsilon %f\n", gauge_smear_epsilon); break; default: errorQuda("Undefined test type %d given", test_type); } printfQuda(" - smearing steps %d\n", gauge_smear_steps); + printfQuda(" - smearing ignore direction %d\n", gauge_smear_dir_ignore); printfQuda(" - Measurement interval %d\n", measurement_interval); printfQuda("Grid partition info: X Y Z T\n"); @@ -63,6 +73,7 @@ void add_su3_option_group(std::shared_ptr quda_app) CLI::TransformPairs gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE}, {"stout", QUDA_GAUGE_SMEAR_STOUT}, {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}, + {"hyp", QUDA_GAUGE_SMEAR_HYP}, {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW}, {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}}; @@ -72,7 +83,7 @@ void add_su3_option_group(std::shared_ptr quda_app) opgroup ->add_option( "--su3-smear-type", - gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, Wilson Flow, Symanzik Flow (default stout)") + gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)") ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map)); ; opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)"); @@ -83,6 +94,14 @@ void add_su3_option_group(std::shared_ptr quda_app) opgroup->add_option("--su3-smear-epsilon", gauge_smear_epsilon, "epsilon coefficient for Over-Improved Stout smearing or Wilson flow (default 0.1)"); + opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)"); + opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)"); + opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)"); + + opgroup->add_option( + "--su3-smear-dir-ignore", gauge_smear_dir_ignore, + "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)"); + opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)"); opgroup->add_option("--su3-measurement-interval", measurement_interval, @@ -251,12 +270,17 @@ int main(int argc, char **argv) smear_param.alpha = gauge_smear_alpha; smear_param.rho = gauge_smear_rho; smear_param.epsilon = gauge_smear_epsilon; + smear_param.alpha1 = gauge_smear_alpha1; + smear_param.alpha2 = gauge_smear_alpha2; + smear_param.alpha3 = gauge_smear_alpha3; + smear_param.dir_ignore = gauge_smear_dir_ignore; host_timer.start(); // start the timer switch (smear_param.smear_type) { case QUDA_GAUGE_SMEAR_APE: case QUDA_GAUGE_SMEAR_STOUT: - case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: { + case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: + case QUDA_GAUGE_SMEAR_HYP: { performGaugeSmearQuda(&smear_param, obs_param); break; } diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp index f2b8f54bcc..196777d574 100644 --- a/tests/utils/misc.cpp +++ b/tests/utils/misc.cpp @@ -172,6 +172,7 @@ const char *get_gauge_smear_str(QudaGaugeSmearType type) case QUDA_GAUGE_SMEAR_APE: ret = "APE"; break; case QUDA_GAUGE_SMEAR_STOUT: ret = "Stout"; break; case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: ret = "Over Improved"; break; + case QUDA_GAUGE_SMEAR_HYP: ret = "HYP"; break; case QUDA_GAUGE_SMEAR_WILSON_FLOW: ret = "Wilson Flow"; break; case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: ret = "Symanzik Flow"; break; default: ret = "unknown"; break;