From bc889990f667280e19402d020c956bac0bc9f6fb Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Mon, 8 Jan 2024 01:16:14 +0800 Subject: [PATCH 01/22] Add `dir_ignore` to `QudaGaugeSmearParam`. --- include/gauge_tools.h | 9 ++++++--- include/kernels/gauge_ape.cuh | 9 ++++++--- include/kernels/gauge_stout.cuh | 12 +++++++---- include/quda.h | 1 + lib/gauge_ape.cu | 27 +++++++++++++++++-------- lib/gauge_stout.cu | 36 ++++++++++++++++++++++++--------- lib/interface_quda.cpp | 6 +++--- 7 files changed, 69 insertions(+), 31 deletions(-) diff --git a/include/gauge_tools.h b/include/gauge_tools.h index 9b7d68db37..80d3a4f12b 100644 --- a/include/gauge_tools.h +++ b/include/gauge_tools.h @@ -95,8 +95,9 @@ namespace quda @param[out] dataDs Output smeared field @param[in] dataOr Input gauge field @param[in] alpha smearing parameter + @param[in] dir_ignore ignored direction */ - void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha); + void APEStep(GaugeField &dataDs, GaugeField &dataOr, double alpha, int dir_ignore); /** @brief Apply STOUT smearing to the gauge field @@ -104,8 +105,9 @@ namespace quda @param[out] dataDs Output smeared field @param[in] dataOr Input gauge field @param[in] rho smearing parameter + @param[in] dir_ignore ignored direction */ - void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho); + void STOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, int dir_ignore); /** @brief Apply Over Improved STOUT smearing to the gauge field @@ -114,8 +116,9 @@ namespace quda @param[in] dataOr Input gauge field @param[in] rho smearing parameter @param[in] epsilon smearing parameter + @param[in] dir_ignore ignored direction */ - void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon); + void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon, int dir_ignore); /** @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field. diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh index 3bf490dfea..226f426f73 100644 --- a/include/kernels/gauge_ape.cuh +++ b/include/kernels/gauge_ape.cuh @@ -26,13 +26,15 @@ namespace quda int X[4]; // grid dimensions int border[4]; const Float alpha; + const int dir_ignore; const Float tolerance; - GaugeAPEArg(GaugeField &out, const GaugeField &in, double alpha) : + GaugeAPEArg(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) : kernel_param(dim3(in.LocalVolumeCB(), 2, apeDim)), out(out), in(in), alpha(alpha), + dir_ignore(dir_ignore), tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL) { for (int dir = 0; dir < 4; ++dir) { @@ -61,16 +63,17 @@ namespace quda x[dr] += arg.border[dr]; X[dr] += 2 * arg.border[dr]; } + dir = dir + (dir >= arg.dir_ignore); int dx[4] = {0, 0, 0, 0}; Link U, Stap, TestU, I; // This function gets stap = S_{mu,nu} i.e., the staple of length 3, - computeStaple(arg, x, X, parity, dir, Stap, Arg::apeDim); + computeStaple(arg, x, X, parity, dir, Stap, arg.dir_ignore); // Get link U U = arg.in(dir, linkIndexShift(x, dx, X), parity); - Stap = Stap * (arg.alpha / ((real)(2. * (3. - 1.)))); + Stap = Stap * (arg.alpha / ((real)(2 * (Arg::apeDim - 1)))); setIdentity(&I); TestU = I * (static_cast(1.0) - arg.alpha) + Stap * conj(U); diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 2a512e77e3..f9072a4ee8 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -28,14 +28,16 @@ namespace quda const Float rho; const Float staple_coeff; const Float rectangle_coeff; + const int dir_ignore; - STOUTArg(GaugeField &out, const GaugeField &in, Float rho, Float epsilon = 0) : + STOUTArg(GaugeField &out, const GaugeField &in, Float rho, Float epsilon, int dir_ignore) : kernel_param(dim3(1, 2, stoutDim)), out(out), in(in), rho(rho), staple_coeff(rho * (5.0 - 2.0 * epsilon) / 3.0), - rectangle_coeff(rho * (1.0 - epsilon) / 12.0) + rectangle_coeff(rho * (1.0 - epsilon) / 12.0), + dir_ignore(dir_ignore) { for (int dir = 0; dir < 4; ++dir) { border[dir] = in.R()[dir]; @@ -67,11 +69,12 @@ namespace quda x[dr] += arg.border[dr]; X[dr] += 2 * arg.border[dr]; } + dir = dir + (dir >= arg.dir_ignore); Link U, Stap, Q; // This function gets stap = S_{mu,nu} i.e., the staple of length 3, - computeStaple(arg, x, X, parity, dir, Stap, Arg::stoutDim); + computeStaple(arg, x, X, parity, dir, Stap, arg.dir_ignore); // Get link U U = arg.in(dir, linkIndex(x, X), parity); @@ -133,6 +136,7 @@ namespace quda x[dr] += arg.border[dr]; X[dr] += 2 * arg.border[dr]; } + dir = dir + (dir >= arg.dir_ignore); Link U, Q; ThreadLocalCache Stap; @@ -141,7 +145,7 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: // https://arxiv.org/abs/0801.1165 - computeStapleRectangle(arg, x, X, parity, dir, Stap, Rect, Arg::stoutDim); + computeStapleRectangle(arg, x, X, parity, dir, Stap, Rect, arg.dir_ignore); // Get link U U = arg.in(dir, linkIndex(x, X), parity); diff --git a/include/quda.h b/include/quda.h index d11fd4cbd0..ed586e1b54 100644 --- a/include/quda.h +++ b/include/quda.h @@ -848,6 +848,7 @@ extern "C" { double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ QudaGaugeSmearType smear_type; /**< The smearing type to perform */ + int dir_ignore; /**< The direction to be ignored by the smearing algorithm */ } QudaGaugeSmearParam; typedef struct QudaBLASParam_s { diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu index a2c8a92dd8..3f51ef566a 100644 --- a/lib/gauge_ape.cu +++ b/lib/gauge_ape.cu @@ -8,21 +8,26 @@ namespace quda { template class GaugeAPE : TunableKernel3D { - static constexpr int apeDim = 3; // apply APE in space only GaugeField &out; const GaugeField ∈ const Float alpha; + const int dir_ignore; + const int apeDim; unsigned int minThreads() const { return in.LocalVolumeCB(); } unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: - // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim - GaugeAPE(GaugeField &out, const GaugeField &in, double alpha) : - TunableKernel3D(in, 2, apeDim), + // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + GaugeAPE(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) : + TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), in(in), - alpha(static_cast(alpha)) + alpha(static_cast(alpha)), + dir_ignore(dir_ignore), + apeDim((dir_ignore == 4) ? 4 : 3) { + strcat(aux, ",dir_ignore="); + i32toa(aux + strlen(aux), dir_ignore); strcat(aux, comm_dim_partitioned_string()); apply(device::get_default_stream()); } @@ -30,7 +35,11 @@ namespace quda { void apply(const qudaStream_t &stream) { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - launch(tp, stream, GaugeAPEArg(out, in, alpha)); + if (apeDim == 3) { + launch(tp, stream, GaugeAPEArg(out, in, alpha, dir_ignore)); + } else if (apeDim == 4) { + launch(tp, stream, GaugeAPEArg(out, in, alpha, dir_ignore)); + } } void preTune() { out.backup(); } // defensive measure in case they alias @@ -50,16 +59,18 @@ namespace quda { }; // GaugeAPE - void APEStep(GaugeField &out, GaugeField& in, double alpha) + void APEStep(GaugeField &out, GaugeField& in, double alpha, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); checkNative(out, in); + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - instantiate(out, in, alpha); + instantiate(out, in, alpha, dir_ignore); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index 48d7e638e8..2a234d861b 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -12,6 +12,7 @@ namespace quda { const bool improved; const Float rho; const Float epsilon; + const int dir_ignore; const int stoutDim; unsigned int minThreads() const { return in.LocalVolumeCB(); } @@ -24,17 +25,20 @@ namespace quda { } public: - // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim - GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon = 0.0) : - TunableKernel3D(in, 2, improved ? 4 : 3), + // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon, int dir_ignore) : + TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), in(in), improved(improved), rho(static_cast(rho)), epsilon(static_cast(epsilon)), - stoutDim(improved ? 4 : 3) + dir_ignore(dir_ignore), + stoutDim((dir_ignore == 4) ? 4 : 3) { if (improved) strcat(aux, ",improved"); + strcat(aux, ",dir_ignore="); + i32toa(aux + strlen(aux), dir_ignore); strcat(aux, comm_dim_partitioned_string()); apply(device::get_default_stream()); } @@ -43,10 +47,18 @@ namespace quda { { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); if (!improved) { - launch(tp, stream, STOUTArg(out, in, rho)); + if (stoutDim == 3) { + launch(tp, stream, STOUTArg(out, in, rho, 0.0, dir_ignore)); + } else if (stoutDim == 4) { + launch(tp, stream, STOUTArg(out, in, rho, 0.0, dir_ignore)); + } } else if (improved) { tp.set_max_shared_bytes = true; - launch(tp, stream, STOUTArg(out, in, rho, epsilon)); + if (stoutDim == 3) { + launch(tp, stream, STOUTArg(out, in, rho, epsilon, dir_ignore)); + } else if (stoutDim == 4) { + launch(tp, stream, STOUTArg(out, in, rho, epsilon, dir_ignore)); + } } } @@ -71,30 +83,34 @@ namespace quda { out.Reconstruct() * out.Precision()) * stoutDim * in.LocalVolume(); } }; - void STOUTStep(GaugeField &out, GaugeField &in, double rho) + void STOUTStep(GaugeField &out, GaugeField &in, double rho, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); checkNative(out, in); + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - instantiate(out, in, false, rho); + instantiate(out, in, false, rho, 0.0, dir_ignore); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } - void OvrImpSTOUTStep(GaugeField &out, GaugeField& in, double rho, double epsilon) + void OvrImpSTOUTStep(GaugeField &out, GaugeField& in, double rho, double epsilon, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); checkNative(out, in); + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - instantiate(out, in, true, rho, epsilon); + instantiate(out, in, true, rho, epsilon, dir_ignore); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index b07a0b2c7b..4d655945ab 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -5090,10 +5090,10 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable for (unsigned int i = 0; i < smear_param->n_steps; i++) { switch (smear_param->smear_type) { - case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha); break; - case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho); break; + case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha, smear_param->dir_ignore); break; + case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->dir_ignore); break; case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: - OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon); + OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore); break; default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); } From 4aaa49a9ece59af928fa406de946aec0d3fc896e Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Mon, 8 Jan 2024 20:46:21 +0800 Subject: [PATCH 02/22] Add HYP smearing. --- include/enum_quda.h | 1 + include/enum_quda_fortran.h | 5 +- include/gauge_tools.h | 12 + include/kernels/gauge_hyp.cuh | 445 ++++++++++++++++++++++++++++++++++ include/quda.h | 3 + lib/CMakeLists.txt | 2 +- lib/check_params.h | 8 + lib/gauge_hyp.cu | 107 ++++++++ lib/interface_quda.cpp | 21 ++ tests/su3_test.cpp | 26 +- tests/utils/misc.cpp | 1 + 11 files changed, 626 insertions(+), 5 deletions(-) create mode 100644 include/kernels/gauge_hyp.cuh create mode 100644 lib/gauge_hyp.cu diff --git a/include/enum_quda.h b/include/enum_quda.h index c4cbb59901..60c186eb77 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -571,6 +571,7 @@ typedef enum QudaGaugeSmearType_s { QUDA_GAUGE_SMEAR_APE, QUDA_GAUGE_SMEAR_STOUT, QUDA_GAUGE_SMEAR_OVRIMP_STOUT, + QUDA_GAUGE_SMEAR_HYP, QUDA_GAUGE_SMEAR_WILSON_FLOW, QUDA_GAUGE_SMEAR_SYMANZIK_FLOW, QUDA_GAUGE_SMEAR_INVALID = QUDA_INVALID_ENUM diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 6a8708948a..ada17e2cc8 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -504,8 +504,9 @@ #define QUDA_GAUGE_SMEAR_APE 0 #define QUDA_GAUGE_SMEAR_STOUT 1 #define QUDA_GAUGE_SMEAR_OVRIMP_STOUT 2 -#define QUDA_GAUGE_SMEAR_WILSON_FLOW 3 -#define QUDA_GAUGE_SMEAR_SYMANZIK_FLOW 4 +#define QUDA_GAUGE_SMEAR_HYP 3 +#define QUDA_GAUGE_SMEAR_WILSON_FLOW 4 +#define QUDA_GAUGE_SMEAR_SYMANZIK_FLOW 5 #define QUDA_GAUGE_SMEAR_INVALID QUDA_INVALID_ENUM #define QudaFermionSmearType integer(4) diff --git a/include/gauge_tools.h b/include/gauge_tools.h index 80d3a4f12b..46a84fed83 100644 --- a/include/gauge_tools.h +++ b/include/gauge_tools.h @@ -120,6 +120,18 @@ namespace quda */ void OvrImpSTOUTStep(GaugeField &dataDs, GaugeField &dataOr, double rho, double epsilon, int dir_ignore); + /** + @brief Apply HYP smearing to the gauge field + @param[out] dataDs Output smeared field + @param[in] dataTemp Temp space + @param[in] dataOr Input gauge field + @param[in] alpha1 smearing parameter + @param[in] alpha2 smearing parameter + @param[in] alpha3 smearing parameter + @param[in] dir_ignore ignored direction + */ + void HYPStep(GaugeField &dataDs, GaugeField *dataTemp[4], GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore); + /** @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field. This routine assumes that the input and output fields are diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh new file mode 100644 index 0000000000..6184a31fa9 --- /dev/null +++ b/include/kernels/gauge_hyp.cuh @@ -0,0 +1,445 @@ +#include +#include +#include +#include +#include +#include + +namespace quda +{ + +#define DOUBLE_TOL 1e-15 +#define SINGLE_TOL 2e-6 + + template + struct GaugeHYPArg : kernel_param<> { + using Float = Float_; + static constexpr int nColor = nColor_; + static_assert(nColor == 3, "Only nColor=3 enabled at this time"); + static constexpr QudaReconstructType recon = recon_; + static constexpr int hypDim = hypDim_; + static constexpr int level = level_; + typedef typename gauge_mapper::type Gauge; + + Gauge out; + Gauge tmp[4]; + const Gauge in; + + int X[4]; // grid dimensions + int border[4]; + const Float alpha; + const int dir_ignore; + const Float tolerance; + + GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) : + kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)), + out(out), + tmp{*tmp[0], *tmp[1], *tmp[2], *tmp[3]}, + in(in), + alpha(alpha), + dir_ignore(dir_ignore), + tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL) + { + for (int dir = 0; dir < 4; ++dir) { + border[dir] = in.R()[dir]; + X[dir] = in.X()[dir] - border[dir] * 2; + } + } + }; + + template + __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3]) + { + using Link = typename get_type::type; + for (int i = 0; i < 3; ++i) staple[i] = Link(); + + thread_array dx = { }; + int cnt = -1; +#pragma unroll + for (int nu = 0; nu < 4; ++nu) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim + // when used with STOUT or APE for measurement smearing) + if (nu == mu) continue; + + cnt += 1; + + { + // Get link U_{\nu}(x) + Link U1 = arg.in(nu, linkIndexShift(x, dx, X), parity); + + // Get link U_{\mu}(x+\nu) + dx[nu]++; + Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity); + dx[nu]--; + + // Get link U_{\nu}(x+\mu) + dx[mu]++; + Link U3 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity); + dx[mu]--; + + // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) + staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); + } + + { + // Get link U_{\nu}(x-\nu) + dx[nu]--; + Link U1 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity); + // Get link U_{\mu}(x-\nu) + Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity); + + // Get link U_{\nu}(x-\nu+\mu) + dx[mu]++; + Link U3 = arg.in(nu, linkIndexShift(x, dx, X), parity); + + // reset dx + dx[mu]--; + dx[nu]++; + + // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) + staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; + } + } + } + + template + __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3]) + { + using Link = typename get_type::type; + for (int i = 0; i < 3; ++i) staple[i] = Link(); + + thread_array dx = { }; + int cnt = -1; +#pragma unroll + for (int nu = 0; nu < 4; nu++) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim + // when used with STOUT or APE for measurement smearing) + if (nu == mu) continue; + + cnt += 1; + + for (int rho = 0; rho < 4; ++rho) { + if (rho == mu || rho == nu) continue; + int sigma = 0; + while (sigma == mu || sigma == nu || sigma == rho) sigma += 1; + + const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); + const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); + + { + // Get link U_{\rho}(x) + Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), parity); + + // Get link U_{\mu}(x+\rho) + dx[rho]++; + Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, X), 1 - parity); + dx[rho]--; + + // Get link U_{\rho}(x+\mu) + dx[mu]++; + Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), 1 - parity); + dx[mu]--; + + // staple += U_{\rho}(x) * U_{\mu}(x+\rho) * U^\dag_{\rho}(x+\mu) + staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); + } + + { + // Get link U_{\rho}(x-\rho) + dx[rho]--; + Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), 1 - parity); + // Get link U_{\mu}(x-\rho) + Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, X), 1 - parity); + + // Get link U_{\rho}(x-\rho+\mu) + dx[mu]++; + Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), parity); + + // reset dx + dx[mu]--; + dx[rho]++; + + // staple += U^\dag_{\rho}(x-\rho) * U_{\mu}(x-\rho) * U_{\rho}(x-\rho+\mu) + staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; + } + } + } + } + + template + __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple &staple) + { + using Link = typename get_type::type; + staple = Link(); + + thread_array dx = { }; +#pragma unroll + for (int nu = 0; nu < 4 ; nu++) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim + // when used with STOUT or APE for measurement smearing) + if (nu == mu) continue; + + const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu); + const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); + + { + // Get link U_{\nu}(x) + Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), parity); + + // Get link U_{\mu}(x+\nu) + dx[nu]++; + Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, X), 1 - parity); + dx[nu]--; + + // Get link U_{\nu}(x+\mu) + dx[mu]++; + Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), 1 - parity); + dx[mu]--; + + // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) + staple = staple + U1 * U2 * conj(U3); + } + + { + // Get link U_{\nu}(x-\nu) + dx[nu]--; + Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), 1 - parity); + // Get link U_{\mu}(x-\nu) + Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, X), 1 - parity); + + // Get link U_{\nu}(x-\nu+\mu) + dx[mu]++; + Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), parity); + + // reset dx + dx[mu]--; + dx[nu]++; + + // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) + staple = staple + conj(U1) * U2 * U3; + } + } + } + + template struct HYP { + const Arg &arg; + constexpr HYP(const Arg &arg) : arg(arg) {} + static constexpr const char* filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity, int dir) + { + using real = typename Arg::Float; + typedef Matrix, Arg::nColor> Link; + + // compute spacetime and local coords + int X[4]; + for (int dr = 0; dr < 4; ++dr) X[dr] = arg.X[dr]; + int x[4]; + getCoords(x, x_cb, X, parity); + for (int dr = 0; dr < 4; ++dr) { + x[dr] += arg.border[dr]; + X[dr] += 2 * arg.border[dr]; + } + + int dx[4] = {0, 0, 0, 0}; + Link U, Stap[3], TestU, I; + + // Get link U + U = arg.in(dir, linkIndexShift(x, dx, X), parity); + setIdentity(&I); + + if constexpr (Arg::level == 1) { + computeStapleLevel1(arg, x, X, parity, dir, Stap); + + for (int i = 0; i < 3; ++i) { + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.)); + polarSu3(TestU, arg.tolerance); + arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U; + } + } else if constexpr (Arg::level == 2) { + computeStapleLevel2(arg, x, X, parity, dir, Stap); + + for (int i = 0; i < 3; ++i) { + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)4.)); + polarSu3(TestU, arg.tolerance); + arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U; + } + } else if constexpr (Arg::level == 3) { + computeStapleLevel3(arg, x, X, parity, dir, Stap[0]); + + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.)); + polarSu3(TestU, arg.tolerance); + arg.out(dir, linkIndexShift(x, dx, X), parity) = TestU * U; + } + } + }; + + template + __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], const int dir_ignore) + { + using Link = typename get_type::type; + for (int i = 0; i < 2; ++i) staple[i] = Link(); + + thread_array dx = { }; + int cnt = -1; +#pragma unroll + for (int nu = 0; nu < 4; ++nu) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim + // when used with STOUT or APE for measurement smearing) + if (nu == mu || nu == dir_ignore) continue; + + cnt += 1; + + { + // Get link U_{\nu}(x) + Link U1 = arg.in(nu, linkIndexShift(x, dx, X), parity); + + // Get link U_{\mu}(x+\nu) + dx[nu]++; + Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity); + dx[nu]--; + + // Get link U_{\nu}(x+\mu) + dx[mu]++; + Link U3 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity); + dx[mu]--; + + // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) + staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); + } + + { + // Get link U_{\nu}(x-\nu) + dx[nu]--; + Link U1 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity); + // Get link U_{\mu}(x-\nu) + Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity); + + // Get link U_{\nu}(x-\nu+\mu) + dx[mu]++; + Link U3 = arg.in(nu, linkIndexShift(x, dx, X), parity); + + // reset dx + dx[mu]--; + dx[nu]++; + + // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) + staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; + } + } + } + + template + __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], int dir_ignore) + { + using Link = typename get_type::type; + for (int i = 0; i < 1; ++i) staple[i] = Link(); + + thread_array dx = { }; + int cnt = -1; +#pragma unroll + for (int nu = 0; nu < 4; nu++) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim + // when used with STOUT or APE for measurement smearing) + if (nu == mu || nu == dir_ignore) continue; + + cnt += 1; + + for (int rho = 0; rho < 4; ++rho) { + if (rho == mu || rho == nu || rho == dir_ignore) continue; + + const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore); + const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore); + + { + // Get link U_{\nu}(x) + Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), parity); + + // Get link U_{\mu}(x+\nu) + dx[nu]++; + Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, X), 1 - parity); + dx[nu]--; + + // Get link U_{\nu}(x+\mu) + dx[mu]++; + Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), 1 - parity); + dx[mu]--; + + // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) + staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); + } + + { + // Get link U_{\nu}(x-\nu) + dx[nu]--; + Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), 1 - parity); + // Get link U_{\mu}(x-\nu) + Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, X), 1 - parity); + + // Get link U_{\nu}(x-\nu+\mu) + dx[mu]++; + Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), parity); + + // reset dx + dx[mu]--; + dx[rho]++; + + // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) + staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; + } + } + } + } + + template struct HYP3D { + const Arg &arg; + constexpr HYP3D(const Arg &arg) : arg(arg) {} + static constexpr const char* filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity, int dir) + { + using real = typename Arg::Float; + typedef Matrix, Arg::nColor> Link; + + // compute spacetime and local coords + int X[4]; + for (int dr = 0; dr < 4; ++dr) X[dr] = arg.X[dr]; + int x[4]; + getCoords(x, x_cb, X, parity); + for (int dr = 0; dr < 4; ++dr) { + x[dr] += arg.border[dr]; + X[dr] += 2 * arg.border[dr]; + } + int dir_ = dir; + dir = dir + (dir >= arg.dir_ignore); + + int dx[4] = {0, 0, 0, 0}; + Link U, Stap[2], TestU, I; + + // Get link U + U = arg.in(dir, linkIndexShift(x, dx, X), parity); + setIdentity(&I); + + if constexpr (Arg::level == 1) { + computeStaple3DLevel1(arg, x, X, parity, dir, Stap, arg.dir_ignore); + + for (int i = 0; i < 2; ++i) { + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.)); + polarSu3(TestU, arg.tolerance); + arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, X), parity) = TestU * U; + } + } else if constexpr (Arg::level == 2) { + computeStaple3DLevel2(arg, x, X, parity, dir, Stap, arg.dir_ignore); + + TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.)); + polarSu3(TestU, arg.tolerance); + arg.out(dir, linkIndexShift(x, dx, X), parity) = TestU * U; + } + } + }; +} // namespace quda \ No newline at end of file diff --git a/include/quda.h b/include/quda.h index ed586e1b54..ccef6e3d80 100644 --- a/include/quda.h +++ b/include/quda.h @@ -846,6 +846,9 @@ extern "C" { Wilson/Symanzik flow */ double alpha; /**< The single coefficient used in APE smearing */ double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ + double alpha1; /**< The first coefficient used in HYP smearing */ + double alpha2; /**< The second coefficient used in HYP smearing */ + double alpha3; /**< The third coefficient used in HYP smearing */ unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ QudaGaugeSmearType smear_type; /**< The smearing type to perform */ int dir_ignore; /**< The direction to be ignored by the smearing algorithm */ diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index d8c1d8342b..e9b7945ea7 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -27,7 +27,7 @@ set (QUDA_OBJS gauge_phase.cu timer.cpp solver.cpp inv_bicgstab_quda.cpp inv_cg_quda.cpp inv_bicgstabl_quda.cpp inv_multi_cg_quda.cpp inv_eigcg_quda.cpp gauge_ape.cu - gauge_stout.cu gauge_wilson_flow.cu gauge_plaq.cu + gauge_stout.cu gauge_hyp.cu gauge_wilson_flow.cu gauge_plaq.cu gauge_laplace.cpp gauge_observable.cpp inv_cg3_quda.cpp inv_ca_gcr.cpp inv_ca_cg.cpp inv_gcr_quda.cpp inv_mr_quda.cpp inv_sd_quda.cpp diff --git a/lib/check_params.h b/lib/check_params.h index 00892741f2..06247d0306 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -1100,12 +1100,20 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param) P(alpha, 0.0); P(rho, 0.0); P(epsilon, 0.0); + P(alpha1, 0.0); + P(alpha2, 0.0); + P(alpha3, 0.0); + P(dir_ignore, 4); #else P(n_steps, (unsigned int)INVALID_INT); P(meas_interval, (unsigned int)INVALID_INT); P(alpha, INVALID_DOUBLE); P(rho, INVALID_DOUBLE); P(epsilon, INVALID_DOUBLE); + P(alpha1, INVALID_DOUBLE); + P(alpha2, INVALID_DOUBLE); + P(alpha3, INVALID_DOUBLE); + P(dir_ignore, INVALID_INT); #endif #ifdef INIT_PARAM diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu new file mode 100644 index 0000000000..7c587a97e1 --- /dev/null +++ b/lib/gauge_hyp.cu @@ -0,0 +1,107 @@ +#include +#include +#include +#include +#include + +namespace quda { + + template class GaugeHYP : TunableKernel3D + { + GaugeField &out; + GaugeField *tmp[4]; + const GaugeField ∈ + const Float alpha; + const int level; + const int dir_ignore; + const int hypDim; + unsigned int minThreads() const { return in.LocalVolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array + + public: + // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) : + TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), + out(out), + tmp{tmp[0], tmp[1], tmp[2], tmp[3]}, + in(in), + alpha(static_cast(alpha)), + level(level), + dir_ignore(dir_ignore), + hypDim((dir_ignore == 4) ? 4 : 3) + { + strcat(aux, ",level="); + i32toa(aux + strlen(aux), level); + strcat(aux, ",dir_ignore="); + i32toa(aux + strlen(aux), dir_ignore); + strcat(aux, comm_dim_partitioned_string()); + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + if (hypDim == 4) { + if (level == 1) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } else if (level == 2) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } else if (level == 3) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } + } else if (hypDim == 3) { + if (level == 1) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } else if (level == 2) { + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + } + } + } + + void preTune() { out.backup(); } // defensive measure in case they alias + void postTune() { out.restore(); } + + long long flops() const + { + auto mat_flops = in.Ncolor() * in.Ncolor() * (8ll * in.Ncolor() - 2ll); + return (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); + } + + long long bytes() const // 6 links per dim, 1 in, 1 out. + { + return ((1 + (hypDim - 1) * 6) * in.Reconstruct() * in.Precision() + + out.Reconstruct() * out.Precision()) * hypDim * in.LocalVolume(); + } + + }; // GaugeAPE + + void HYPStep(GaugeField &out, GaugeField *tmp[4], GaugeField& in, double alpha1, double alpha2, double alpha3, int dir_ignore) + { + checkPrecision(out, in); + checkReconstruct(out, in); + checkNative(out, in); + + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + + if (dir_ignore == 4) { + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); + in.exchangeExtendedGhost(in.R(), false); + instantiate(out, tmp, in, alpha3, 1, dir_ignore); + tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false); + tmp[1]->exchangeExtendedGhost(tmp[1]->R(), false); + instantiate(out, tmp, in, alpha2, 2, dir_ignore); + tmp[2]->exchangeExtendedGhost(tmp[2]->R(), false); + tmp[3]->exchangeExtendedGhost(tmp[3]->R(), false); + instantiate(out, tmp, in, alpha1, 3, dir_ignore); + out.exchangeExtendedGhost(out.R(), false); + } else { + copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); + in.exchangeExtendedGhost(in.R(), false); + instantiate(out, tmp, in, alpha3, 1, dir_ignore); + tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false); + instantiate(out, tmp, in, alpha2, 2, dir_ignore); + out.exchangeExtendedGhost(out.R(), false); + } + } + +} \ No newline at end of file diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 4d655945ab..cc8ffa6254 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -5084,6 +5084,20 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable gParam.location = QUDA_CUDA_FIELD_LOCATION; GaugeField tmp(gParam); + const int smearDim = (smear_param->dir_ignore >= 0 && smear_param->dir_ignore <= 3) ? 3 : 4; + GaugeField *aux[4]; + if (smear_param->smear_type == QUDA_GAUGE_SMEAR_HYP) { + gParam.geometry = QUDA_TENSOR_GEOMETRY; + if (smearDim == 3) { + aux[0] = new GaugeField(gParam); + // aux[1], aux[2] and aux[3] will not be used for smearDim=3 + gParam.create = QUDA_REFERENCE_FIELD_CREATE; + for (int i = 1; i < 4; ++i) { aux[i] = new GaugeField(gParam); } + } else { + for (int i = 0; i < 4; ++i) { aux[i] = new GaugeField(gParam); } + } + } + int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); @@ -5095,6 +5109,9 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore); break; + case QUDA_GAUGE_SMEAR_HYP: + HYPStep(*gaugeSmeared, aux, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore); + break; default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); } @@ -5105,6 +5122,10 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable } } + if (smear_param->smear_type == QUDA_GAUGE_SMEAR_HYP) { + for (int i = 0; i < 4; ++i) { delete aux[i]; } + } + popOutputPrefix(); } diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp index a03726f3fb..3adc58e6d7 100644 --- a/tests/su3_test.cpp +++ b/tests/su3_test.cpp @@ -22,8 +22,12 @@ double gauge_smear_rho = 0.1; double gauge_smear_epsilon = 0.1; double gauge_smear_alpha = 0.6; +double gauge_smear_alpha1 = 0.75; +double gauge_smear_alpha2 = 0.6; +double gauge_smear_alpha3 = 0.3; int gauge_smear_steps = 50; QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; +int gauge_smear_dir_ignore = 4; int measurement_interval = 5; bool su_project = true; @@ -45,11 +49,17 @@ void display_test_info() printfQuda(" - rho %f\n", gauge_smear_rho); printfQuda(" - epsilon %f\n", gauge_smear_epsilon); break; + case QUDA_GAUGE_SMEAR_HYP: + printfQuda(" - alpha1 %f\n", gauge_smear_alpha1); + printfQuda(" - alpha2 %f\n", gauge_smear_alpha2); + printfQuda(" - alpha3 %f\n", gauge_smear_alpha3); + break; case QUDA_GAUGE_SMEAR_WILSON_FLOW: case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: printfQuda(" - epsilon %f\n", gauge_smear_epsilon); break; default: errorQuda("Undefined test type %d given", test_type); } printfQuda(" - smearing steps %d\n", gauge_smear_steps); + printfQuda(" - smearing ignore direction %d\n", gauge_smear_dir_ignore); printfQuda(" - Measurement interval %d\n", measurement_interval); printfQuda("Grid partition info: X Y Z T\n"); @@ -63,6 +73,7 @@ void add_su3_option_group(std::shared_ptr quda_app) CLI::TransformPairs gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE}, {"stout", QUDA_GAUGE_SMEAR_STOUT}, {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}, + {"hyp", QUDA_GAUGE_SMEAR_HYP}, {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW}, {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}}; @@ -72,7 +83,7 @@ void add_su3_option_group(std::shared_ptr quda_app) opgroup ->add_option( "--su3-smear-type", - gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, Wilson Flow, Symanzik Flow (default stout)") + gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)") ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map)); ; opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)"); @@ -83,6 +94,12 @@ void add_su3_option_group(std::shared_ptr quda_app) opgroup->add_option("--su3-smear-epsilon", gauge_smear_epsilon, "epsilon coefficient for Over-Improved Stout smearing or Wilson flow (default 0.1)"); + opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)"); + opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)"); + opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)"); + + opgroup->add_option("--su3-smear-dir-ignore", gauge_smear_dir_ignore, "direction to be ignored by the smearing (default 4)"); + opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)"); opgroup->add_option("--su3-measurement-interval", measurement_interval, @@ -251,12 +268,17 @@ int main(int argc, char **argv) smear_param.alpha = gauge_smear_alpha; smear_param.rho = gauge_smear_rho; smear_param.epsilon = gauge_smear_epsilon; + smear_param.alpha1 = gauge_smear_alpha1; + smear_param.alpha2 = gauge_smear_alpha2; + smear_param.alpha3 = gauge_smear_alpha3; + smear_param.dir_ignore = gauge_smear_dir_ignore; host_timer.start(); // start the timer switch (smear_param.smear_type) { case QUDA_GAUGE_SMEAR_APE: case QUDA_GAUGE_SMEAR_STOUT: - case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: { + case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: + case QUDA_GAUGE_SMEAR_HYP: { performGaugeSmearQuda(&smear_param, obs_param); break; } diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp index fd920e5e16..b213ee6f1e 100644 --- a/tests/utils/misc.cpp +++ b/tests/utils/misc.cpp @@ -189,6 +189,7 @@ const char *get_gauge_smear_str(QudaGaugeSmearType type) case QUDA_GAUGE_SMEAR_APE: ret = "APE"; break; case QUDA_GAUGE_SMEAR_STOUT: ret = "Stout"; break; case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: ret = "Over Improved"; break; + case QUDA_GAUGE_SMEAR_HYP: ret = "HYP"; break; case QUDA_GAUGE_SMEAR_WILSON_FLOW: ret = "Wilson Flow"; break; case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: ret = "Symanzik Flow"; break; default: ret = "unknown"; break; From 2089093d74d3ed8ee18c88765cee88036c8693d1 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Mon, 8 Jan 2024 22:33:00 +0800 Subject: [PATCH 03/22] Update `flops()` and `bytes()` for HYP smearing. Fix bugs in 3D HYP smearing. --- include/kernels/gauge_hyp.cuh | 20 +++++++++----------- lib/gauge_hyp.cu | 28 ++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 6184a31fa9..19f456304a 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -169,10 +169,10 @@ namespace quda } template - __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple &staple) + __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3]) { using Link = typename get_type::type; - staple = Link(); + staple[0] = Link(); thread_array dx = { }; #pragma unroll @@ -200,7 +200,7 @@ namespace quda dx[mu]--; // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) - staple = staple + U1 * U2 * conj(U3); + staple[0] = staple[0] + U1 * U2 * conj(U3); } { @@ -219,7 +219,7 @@ namespace quda dx[nu]++; // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) - staple = staple + conj(U1) * U2 * U3; + staple[0] = staple[0] + conj(U1) * U2 * U3; } } } @@ -268,7 +268,7 @@ namespace quda arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U; } } else if constexpr (Arg::level == 3) { - computeStapleLevel3(arg, x, X, parity, dir, Stap[0]); + computeStapleLevel3(arg, x, X, parity, dir, Stap); TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.)); polarSu3(TestU, arg.tolerance); @@ -337,7 +337,7 @@ namespace quda __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], int dir_ignore) { using Link = typename get_type::type; - for (int i = 0; i < 1; ++i) staple[i] = Link(); + staple[0] = Link(); thread_array dx = { }; int cnt = -1; @@ -348,8 +348,6 @@ namespace quda // when used with STOUT or APE for measurement smearing) if (nu == mu || nu == dir_ignore) continue; - cnt += 1; - for (int rho = 0; rho < 4; ++rho) { if (rho == mu || rho == nu || rho == dir_ignore) continue; @@ -371,7 +369,7 @@ namespace quda dx[mu]--; // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) - staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); + staple[0] = staple[0] + U1 * U2 * conj(U3); } { @@ -387,10 +385,10 @@ namespace quda // reset dx dx[mu]--; - dx[rho]++; + dx[nu]++; // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) - staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; + staple[0] = staple[0] + conj(U1) * U2 * U3; } } } diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index 7c587a97e1..b4860cd701 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -64,13 +64,33 @@ namespace quda { long long flops() const { auto mat_flops = in.Ncolor() * in.Ncolor() * (8ll * in.Ncolor() - 2ll); - return (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); + if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { + return ((hypDim - 1) * 2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); + } else if (hypDim == 4 && level == 2) { + return ((hypDim - 1) * 2 + (hypDim - 1) * (hypDim - 2) * 4) * mat_flops * hypDim * in.LocalVolume(); + } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { + return (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); + } } - long long bytes() const // 6 links per dim, 1 in, 1 out. + long long bytes() const { - return ((1 + (hypDim - 1) * 6) * in.Reconstruct() * in.Precision() + - out.Reconstruct() * out.Precision()) * hypDim * in.LocalVolume(); + if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { // 6 links per dim, 1 in, hypDim-1 tmp + return (in.Reconstruct() * in.Precision() + + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision() + + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) + * hypDim * in.LocalVolume(); + } else if (hypDim == 4 && level == 2) { // 6 links per dim, 1 in, hypDim-1 tmp + return (in.Reconstruct() * in.Precision() + + (hypDim - 1) * (hypDim - 2) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() + + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) + * hypDim * in.LocalVolume(); + } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { // 6 links per dim, 1 in, 1 out + return (in.Reconstruct() * in.Precision() + + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() + + out.Reconstruct() * out.Precision()) + * hypDim * in.LocalVolume(); + } } }; // GaugeAPE From 4d073f75d27ff03ae1b7567c0cec49a8e38dce1b Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Wed, 10 Jan 2024 12:55:07 +0800 Subject: [PATCH 04/22] Make compiler happy with STRICT build. --- include/kernels/gauge_hyp.cuh | 1 - lib/gauge_hyp.cu | 28 ++++++++++++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 19f456304a..e30091a02c 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -340,7 +340,6 @@ namespace quda staple[0] = Link(); thread_array dx = { }; - int cnt = -1; #pragma unroll for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index b4860cd701..84281c0b1a 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -63,34 +63,38 @@ namespace quda { long long flops() const { + long long flops = 0; auto mat_flops = in.Ncolor() * in.Ncolor() * (8ll * in.Ncolor() - 2ll); if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { - return ((hypDim - 1) * 2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); + flops += ((hypDim - 1) * 2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); } else if (hypDim == 4 && level == 2) { - return ((hypDim - 1) * 2 + (hypDim - 1) * (hypDim - 2) * 4) * mat_flops * hypDim * in.LocalVolume(); + flops += ((hypDim - 1) * 2 + (hypDim - 1) * (hypDim - 2) * 4) * mat_flops * hypDim * in.LocalVolume(); } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { - return (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); + flops += (2 + (hypDim - 1) * 4) * mat_flops * hypDim * in.LocalVolume(); } + return flops; } long long bytes() const { + long long bytes = 0; if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { // 6 links per dim, 1 in, hypDim-1 tmp - return (in.Reconstruct() * in.Precision() - + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision() - + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) + bytes += (in.Reconstruct() * in.Precision() + + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision() + + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) * hypDim * in.LocalVolume(); } else if (hypDim == 4 && level == 2) { // 6 links per dim, 1 in, hypDim-1 tmp - return (in.Reconstruct() * in.Precision() - + (hypDim - 1) * (hypDim - 2) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() - + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) + bytes += (in.Reconstruct() * in.Precision() + + (hypDim - 1) * (hypDim - 2) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() + + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) * hypDim * in.LocalVolume(); } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { // 6 links per dim, 1 in, 1 out - return (in.Reconstruct() * in.Precision() - + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() - + out.Reconstruct() * out.Precision()) + bytes += (in.Reconstruct() * in.Precision() + + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() + + out.Reconstruct() * out.Precision()) * hypDim * in.LocalVolume(); } + return bytes; } }; // GaugeAPE From 0fc4cb4123531c1b2d40d6cd42a61803ca47f44a Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Wed, 10 Jan 2024 13:06:14 +0800 Subject: [PATCH 05/22] Set default tolerance of SU3 projection 10x epsilon. This solves #1425. --- include/kernels/gauge_ape.cuh | 4 ++-- include/kernels/gauge_hyp.cuh | 4 ++-- lib/gauge_observable.cpp | 2 +- lib/interface_quda.cpp | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh index 226f426f73..7bee11651e 100644 --- a/include/kernels/gauge_ape.cuh +++ b/include/kernels/gauge_ape.cuh @@ -8,8 +8,8 @@ namespace quda { -#define DOUBLE_TOL 1e-15 -#define SINGLE_TOL 2e-6 +#define DOUBLE_TOL 2e-15 +#define SINGLE_TOL 1e-6 template struct GaugeAPEArg : kernel_param<> { diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index e30091a02c..886b2c688f 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -8,8 +8,8 @@ namespace quda { -#define DOUBLE_TOL 1e-15 -#define SINGLE_TOL 2e-6 +#define DOUBLE_TOL 2e-15 +#define SINGLE_TOL 1e-6 template struct GaugeHYPArg : kernel_param<> { diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp index 041dc6164d..5994cf155f 100644 --- a/lib/gauge_observable.cpp +++ b/lib/gauge_observable.cpp @@ -12,7 +12,7 @@ namespace quda int *num_failures_h = static_cast(pool_pinned_malloc(sizeof(int))); int *num_failures_d = static_cast(get_mapped_device_pointer(num_failures_h)); *num_failures_h = 0; - auto tol = u.Precision() == QUDA_DOUBLE_PRECISION ? 1e-14 : 1e-6; + auto tol = u.Precision() == QUDA_DOUBLE_PRECISION ? 2e-15 : 1e-6; projectSU3(u, tol, num_failures_d); if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); pool_pinned_free(num_failures_h); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index cc8ffa6254..6555314dc7 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3746,7 +3746,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, // project onto SU(3) if using the Chroma convention if (param->staggered_phase_type == QUDA_STAGGERED_PHASE_CHROMA) { *num_failures_h = 0; - const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 1e-15 : 2e-6; + const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 2e-15 : 1e-6; if (unitarizedLink.StaggeredPhaseApplied()) unitarizedLink.removeStaggeredPhase(); projectSU3(unitarizedLink, tol, num_failures_d); if (!unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied) From e240272118d7281e27a2abc95f212520b89474e7 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Wed, 10 Jan 2024 13:07:16 +0800 Subject: [PATCH 06/22] clang-format --- include/gauge_tools.h | 3 ++- include/kernels/gauge_ape.cuh | 6 ++--- include/kernels/gauge_hyp.cuh | 45 +++++++++++++++++++---------------- include/quda.h | 8 +++---- lib/gauge_ape.cu | 2 +- lib/gauge_hyp.cu | 26 ++++++++++---------- lib/gauge_stout.cu | 2 +- lib/interface_quda.cpp | 3 ++- tests/su3_test.cpp | 3 ++- 9 files changed, 53 insertions(+), 45 deletions(-) diff --git a/include/gauge_tools.h b/include/gauge_tools.h index 46a84fed83..cfd3f1e8ba 100644 --- a/include/gauge_tools.h +++ b/include/gauge_tools.h @@ -130,7 +130,8 @@ namespace quda @param[in] alpha3 smearing parameter @param[in] dir_ignore ignored direction */ - void HYPStep(GaugeField &dataDs, GaugeField *dataTemp[4], GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore); + void HYPStep(GaugeField &dataDs, GaugeField *dataTemp[4], GaugeField &dataOr, double alpha1, double alpha2, + double alpha3, int dir_ignore); /** @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field. diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh index 7bee11651e..60b3b6a1a0 100644 --- a/include/kernels/gauge_ape.cuh +++ b/include/kernels/gauge_ape.cuh @@ -8,8 +8,8 @@ namespace quda { -#define DOUBLE_TOL 2e-15 -#define SINGLE_TOL 1e-6 +#define DOUBLE_TOL 2e-15 +#define SINGLE_TOL 1e-6 template struct GaugeAPEArg : kernel_param<> { @@ -72,7 +72,7 @@ namespace quda // Get link U U = arg.in(dir, linkIndexShift(x, dx, X), parity); - + Stap = Stap * (arg.alpha / ((real)(2 * (Arg::apeDim - 1)))); setIdentity(&I); diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 886b2c688f..6988908776 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -8,8 +8,8 @@ namespace quda { -#define DOUBLE_TOL 2e-15 -#define SINGLE_TOL 1e-6 +#define DOUBLE_TOL 2e-15 +#define SINGLE_TOL 1e-6 template struct GaugeHYPArg : kernel_param<> { @@ -19,13 +19,13 @@ namespace quda static constexpr QudaReconstructType recon = recon_; static constexpr int hypDim = hypDim_; static constexpr int level = level_; - typedef typename gauge_mapper::type Gauge; + typedef typename gauge_mapper::type Gauge; Gauge out; Gauge tmp[4]; const Gauge in; - int X[4]; // grid dimensions + int X[4]; // grid dimensions int border[4]; const Float alpha; const int dir_ignore; @@ -34,7 +34,7 @@ namespace quda GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) : kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)), out(out), - tmp{*tmp[0], *tmp[1], *tmp[2], *tmp[3]}, + tmp {*tmp[0], *tmp[1], *tmp[2], *tmp[3]}, in(in), alpha(alpha), dir_ignore(dir_ignore), @@ -48,12 +48,13 @@ namespace quda }; template - __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3]) + __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const Int *X, const int parity, + const int mu, Staple staple[3]) { using Link = typename get_type::type; for (int i = 0; i < 3; ++i) staple[i] = Link(); - thread_array dx = { }; + thread_array dx = {}; int cnt = -1; #pragma unroll for (int nu = 0; nu < 4; ++nu) { @@ -104,12 +105,13 @@ namespace quda } template - __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3]) + __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const Int *X, const int parity, + const int mu, Staple staple[3]) { using Link = typename get_type::type; for (int i = 0; i < 3; ++i) staple[i] = Link(); - thread_array dx = { }; + thread_array dx = {}; int cnt = -1; #pragma unroll for (int nu = 0; nu < 4; nu++) { @@ -169,14 +171,15 @@ namespace quda } template - __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[3]) + __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, + const int mu, Staple staple[3]) { using Link = typename get_type::type; staple[0] = Link(); - thread_array dx = { }; + thread_array dx = {}; #pragma unroll - for (int nu = 0; nu < 4 ; nu++) { + for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and // ignore the dir_ignore direction (usually the temporal dim // when used with STOUT or APE for measurement smearing) @@ -226,8 +229,8 @@ namespace quda template struct HYP { const Arg &arg; - constexpr HYP(const Arg &arg) : arg(arg) {} - static constexpr const char* filename() { return KERNEL_FILE; } + constexpr HYP(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } __device__ __host__ inline void operator()(int x_cb, int parity, int dir) { @@ -278,12 +281,13 @@ namespace quda }; template - __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], const int dir_ignore) + __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const Int *X, const int parity, + const int mu, Staple staple[2], const int dir_ignore) { using Link = typename get_type::type; for (int i = 0; i < 2; ++i) staple[i] = Link(); - thread_array dx = { }; + thread_array dx = {}; int cnt = -1; #pragma unroll for (int nu = 0; nu < 4; ++nu) { @@ -334,12 +338,13 @@ namespace quda } template - __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity, const int mu, Staple staple[2], int dir_ignore) + __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity, + const int mu, Staple staple[2], int dir_ignore) { using Link = typename get_type::type; staple[0] = Link(); - thread_array dx = { }; + thread_array dx = {}; #pragma unroll for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and @@ -395,8 +400,8 @@ namespace quda template struct HYP3D { const Arg &arg; - constexpr HYP3D(const Arg &arg) : arg(arg) {} - static constexpr const char* filename() { return KERNEL_FILE; } + constexpr HYP3D(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } __device__ __host__ inline void operator()(int x_cb, int parity, int dir) { diff --git a/include/quda.h b/include/quda.h index ccef6e3d80..8289c4e3d0 100644 --- a/include/quda.h +++ b/include/quda.h @@ -846,12 +846,12 @@ extern "C" { Wilson/Symanzik flow */ double alpha; /**< The single coefficient used in APE smearing */ double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ - double alpha1; /**< The first coefficient used in HYP smearing */ - double alpha2; /**< The second coefficient used in HYP smearing */ - double alpha3; /**< The third coefficient used in HYP smearing */ + double alpha1; /**< The first coefficient used in HYP smearing */ + double alpha2; /**< The second coefficient used in HYP smearing */ + double alpha3; /**< The third coefficient used in HYP smearing */ unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ QudaGaugeSmearType smear_type; /**< The smearing type to perform */ - int dir_ignore; /**< The direction to be ignored by the smearing algorithm */ + int dir_ignore; /**< The direction to be ignored by the smearing algorithm */ } QudaGaugeSmearParam; typedef struct QudaBLASParam_s { diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu index 3f51ef566a..8d61066041 100644 --- a/lib/gauge_ape.cu +++ b/lib/gauge_ape.cu @@ -59,7 +59,7 @@ namespace quda { }; // GaugeAPE - void APEStep(GaugeField &out, GaugeField& in, double alpha, int dir_ignore) + void APEStep(GaugeField &out, GaugeField &in, double alpha, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index 84281c0b1a..331d1cfbf8 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -4,7 +4,8 @@ #include #include -namespace quda { +namespace quda +{ template class GaugeHYP : TunableKernel3D { @@ -23,7 +24,7 @@ namespace quda { GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) : TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), - tmp{tmp[0], tmp[1], tmp[2], tmp[3]}, + tmp {tmp[0], tmp[1], tmp[2], tmp[3]}, in(in), alpha(static_cast(alpha)), level(level), @@ -43,17 +44,17 @@ namespace quda { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); if (hypDim == 4) { if (level == 1) { - launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); } else if (level == 2) { - launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); } else if (level == 3) { - launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); } } else if (hypDim == 3) { if (level == 1) { - launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); } else if (level == 2) { - launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); + launch(tp, stream, GaugeHYPArg(out, tmp, in, alpha, dir_ignore)); } } } @@ -79,8 +80,7 @@ namespace quda { { long long bytes = 0; if ((hypDim == 4 && level == 1) || (hypDim == 3 && level == 1)) { // 6 links per dim, 1 in, hypDim-1 tmp - bytes += (in.Reconstruct() * in.Precision() - + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision() + bytes += (in.Reconstruct() * in.Precision() + (hypDim - 1) * 6 * in.Reconstruct() * in.Precision() + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) * hypDim * in.LocalVolume(); } else if (hypDim == 4 && level == 2) { // 6 links per dim, 1 in, hypDim-1 tmp @@ -89,8 +89,7 @@ namespace quda { + (hypDim - 1) * tmp[0]->Reconstruct() * tmp[0]->Precision()) * hypDim * in.LocalVolume(); } else if ((hypDim == 4 && level == 3) || (hypDim == 3 && level == 2)) { // 6 links per dim, 1 in, 1 out - bytes += (in.Reconstruct() * in.Precision() - + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() + bytes += (in.Reconstruct() * in.Precision() + (hypDim - 1) * 6 * tmp[0]->Reconstruct() * tmp[0]->Precision() + out.Reconstruct() * out.Precision()) * hypDim * in.LocalVolume(); } @@ -99,7 +98,8 @@ namespace quda { }; // GaugeAPE - void HYPStep(GaugeField &out, GaugeField *tmp[4], GaugeField& in, double alpha1, double alpha2, double alpha3, int dir_ignore) + void HYPStep(GaugeField &out, GaugeField *tmp[4], GaugeField &in, double alpha1, double alpha2, double alpha3, + int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); @@ -128,4 +128,4 @@ namespace quda { } } -} \ No newline at end of file +} // namespace quda \ No newline at end of file diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index 2a234d861b..2cf1c4e634 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -99,7 +99,7 @@ namespace quda { out.exchangeExtendedGhost(out.R(), false); } - void OvrImpSTOUTStep(GaugeField &out, GaugeField& in, double rho, double epsilon, int dir_ignore) + void OvrImpSTOUTStep(GaugeField &out, GaugeField &in, double rho, double epsilon, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 6555314dc7..bb144ba4df 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -5110,7 +5110,8 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore); break; case QUDA_GAUGE_SMEAR_HYP: - HYPStep(*gaugeSmeared, aux, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore); + HYPStep(*gaugeSmeared, aux, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, + smear_param->dir_ignore); break; default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); } diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp index 3adc58e6d7..acd2803b42 100644 --- a/tests/su3_test.cpp +++ b/tests/su3_test.cpp @@ -98,7 +98,8 @@ void add_su3_option_group(std::shared_ptr quda_app) opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)"); opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)"); - opgroup->add_option("--su3-smear-dir-ignore", gauge_smear_dir_ignore, "direction to be ignored by the smearing (default 4)"); + opgroup->add_option("--su3-smear-dir-ignore", gauge_smear_dir_ignore, + "direction to be ignored by the smearing (default 4)"); opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)"); From 2839225e7eda42b43ccb8dadc88f3f99c1673662 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Wed, 10 Jan 2024 14:42:06 +0800 Subject: [PATCH 07/22] Use `GaugeField::toleranceSU3()` to get the default tol for SU(3) projection. --- include/gauge_field.h | 13 +++++++++++++ include/kernels/gauge_ape.cuh | 5 +---- include/kernels/gauge_hyp.cuh | 5 +---- lib/gauge_observable.cpp | 2 +- lib/interface_quda.cpp | 2 +- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index ffea94c22b..a62b6a684f 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -638,6 +638,19 @@ namespace quda { */ static bool are_compatible_weak(const GaugeField &a, const GaugeField &b); + /** + * @brief Helper function that returns the default tolerance used by SU(3) projection + * @return The default tolerance, which is ~10x epsilon + */ + double toleranceSU3() const + { + switch (precision) { + case QUDA_DOUBLE_PRECISION: return 2e-15; + case QUDA_SINGLE_PRECISION: return 1e-6; + default: return 1e-6; + } + } + friend struct GaugeFieldParam; }; diff --git a/include/kernels/gauge_ape.cuh b/include/kernels/gauge_ape.cuh index 60b3b6a1a0..f8e00d03db 100644 --- a/include/kernels/gauge_ape.cuh +++ b/include/kernels/gauge_ape.cuh @@ -8,9 +8,6 @@ namespace quda { -#define DOUBLE_TOL 2e-15 -#define SINGLE_TOL 1e-6 - template struct GaugeAPEArg : kernel_param<> { using Float = Float_; @@ -35,7 +32,7 @@ namespace quda in(in), alpha(alpha), dir_ignore(dir_ignore), - tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL) + tolerance(in.toleranceSU3()) { for (int dir = 0; dir < 4; ++dir) { border[dir] = in.R()[dir]; diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 6988908776..a2a74863f6 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -8,9 +8,6 @@ namespace quda { -#define DOUBLE_TOL 2e-15 -#define SINGLE_TOL 1e-6 - template struct GaugeHYPArg : kernel_param<> { using Float = Float_; @@ -38,7 +35,7 @@ namespace quda in(in), alpha(alpha), dir_ignore(dir_ignore), - tolerance(in.Precision() == QUDA_DOUBLE_PRECISION ? DOUBLE_TOL : SINGLE_TOL) + tolerance(in.toleranceSU3()) { for (int dir = 0; dir < 4; ++dir) { border[dir] = in.R()[dir]; diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp index 5994cf155f..37f8061686 100644 --- a/lib/gauge_observable.cpp +++ b/lib/gauge_observable.cpp @@ -12,7 +12,7 @@ namespace quda int *num_failures_h = static_cast(pool_pinned_malloc(sizeof(int))); int *num_failures_d = static_cast(get_mapped_device_pointer(num_failures_h)); *num_failures_h = 0; - auto tol = u.Precision() == QUDA_DOUBLE_PRECISION ? 2e-15 : 1e-6; + auto tol = u.toleranceSU3(); projectSU3(u, tol, num_failures_d); if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); pool_pinned_free(num_failures_h); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index bb144ba4df..a01c75998a 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3746,7 +3746,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, // project onto SU(3) if using the Chroma convention if (param->staggered_phase_type == QUDA_STAGGERED_PHASE_CHROMA) { *num_failures_h = 0; - const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 2e-15 : 1e-6; + const double tol = unitarizedLink.toleranceSU3(); if (unitarizedLink.StaggeredPhaseApplied()) unitarizedLink.removeStaggeredPhase(); projectSU3(unitarizedLink, tol, num_failures_d); if (!unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied) From 1133b3fccbec17194c52674b85086181b97ff85e Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 31 Jan 2024 21:06:18 -0800 Subject: [PATCH 08/22] Moved allocating temporaries into the HYPSmear routine --- include/gauge_tools.h | 3 +-- include/kernels/gauge_hyp.cuh | 2 +- lib/gauge_hyp.cu | 27 ++++++++++++++++++++++++--- lib/interface_quda.cpp | 20 +------------------- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/gauge_tools.h b/include/gauge_tools.h index cfd3f1e8ba..4c782d0ddc 100644 --- a/include/gauge_tools.h +++ b/include/gauge_tools.h @@ -123,14 +123,13 @@ namespace quda /** @brief Apply HYP smearing to the gauge field @param[out] dataDs Output smeared field - @param[in] dataTemp Temp space @param[in] dataOr Input gauge field @param[in] alpha1 smearing parameter @param[in] alpha2 smearing parameter @param[in] alpha3 smearing parameter @param[in] dir_ignore ignored direction */ - void HYPStep(GaugeField &dataDs, GaugeField *dataTemp[4], GaugeField &dataOr, double alpha1, double alpha2, + void HYPStep(GaugeField &dataDs, GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore); /** diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index a2a74863f6..ec7e09e05b 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -28,7 +28,7 @@ namespace quda const int dir_ignore; const Float tolerance; - GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) : + GaugeHYPArg(GaugeField &out, GaugeField* tmp[4], const GaugeField &in, double alpha, int dir_ignore) : kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)), out(out), tmp {*tmp[0], *tmp[1], *tmp[2], *tmp[3]}, diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index 331d1cfbf8..2090d19dd5 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -10,7 +10,7 @@ namespace quda template class GaugeHYP : TunableKernel3D { GaugeField &out; - GaugeField *tmp[4]; + GaugeField* tmp[4]; const GaugeField ∈ const Float alpha; const int level; @@ -21,7 +21,7 @@ namespace quda public: // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim - GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) : + GaugeHYP(GaugeField &out, GaugeField* tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) : TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), tmp {tmp[0], tmp[1], tmp[2], tmp[3]}, @@ -98,13 +98,31 @@ namespace quda }; // GaugeAPE - void HYPStep(GaugeField &out, GaugeField *tmp[4], GaugeField &in, double alpha1, double alpha2, double alpha3, + void HYPStep(GaugeField &out, GaugeField &in, double alpha1, double alpha2, double alpha3, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); checkNative(out, in); + GaugeFieldParam gParam(out); + gParam.location = QUDA_CUDA_FIELD_LOCATION; + const int smearDim = (dir_ignore >= 0 && dir_ignore <= 3) ? 3 : 4; + //GaugeField tmp[4]; + gParam.geometry = QUDA_TENSOR_GEOMETRY; + GaugeFieldParam gParam2(gParam); + gParam2.create = QUDA_REFERENCE_FIELD_CREATE; + + GaugeField* tmp[4]; + if (smearDim == 3) { + tmp[0] = new GaugeField(gParam); + // aux[1], aux[2] and aux[3] will not be used for smearDim == 3 + gParam.create = QUDA_REFERENCE_FIELD_CREATE; + for (int i = 1; i < 4; ++i) { tmp[i] = new GaugeField(gParam); } + } else { + for (int i = 0; i < 4; ++i) { tmp[i] = new GaugeField(gParam); } + } + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } if (dir_ignore == 4) { @@ -126,6 +144,9 @@ namespace quda instantiate(out, tmp, in, alpha2, 2, dir_ignore); out.exchangeExtendedGhost(out.R(), false); } + + for (int i = 0; i < 4; i++) delete tmp[i]; + } } // namespace quda \ No newline at end of file diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index db137a705e..756d7c089a 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -5082,20 +5082,6 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable gParam.location = QUDA_CUDA_FIELD_LOCATION; GaugeField tmp(gParam); - const int smearDim = (smear_param->dir_ignore >= 0 && smear_param->dir_ignore <= 3) ? 3 : 4; - GaugeField *aux[4]; - if (smear_param->smear_type == QUDA_GAUGE_SMEAR_HYP) { - gParam.geometry = QUDA_TENSOR_GEOMETRY; - if (smearDim == 3) { - aux[0] = new GaugeField(gParam); - // aux[1], aux[2] and aux[3] will not be used for smearDim=3 - gParam.create = QUDA_REFERENCE_FIELD_CREATE; - for (int i = 1; i < 4; ++i) { aux[i] = new GaugeField(gParam); } - } else { - for (int i = 0; i < 4; ++i) { aux[i] = new GaugeField(gParam); } - } - } - int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); @@ -5108,7 +5094,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore); break; case QUDA_GAUGE_SMEAR_HYP: - HYPStep(*gaugeSmeared, aux, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, + HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore); break; default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); @@ -5121,10 +5107,6 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable } } - if (smear_param->smear_type == QUDA_GAUGE_SMEAR_HYP) { - for (int i = 0; i < 4; ++i) { delete aux[i]; } - } - popOutputPrefix(); } From 9545d28f83b272bf45d108f60fa025294cbf7dae Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 31 Jan 2024 21:37:58 -0800 Subject: [PATCH 09/22] Code cleanup, introducing extended field coordinate E --- include/kernels/gauge_hyp.cuh | 127 ++++++++++++++++------------------ 1 file changed, 61 insertions(+), 66 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index ec7e09e05b..8e116dcb65 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -22,6 +22,7 @@ namespace quda Gauge tmp[4]; const Gauge in; + int E[4]; // extended grid dimensions int X[4]; // grid dimensions int border[4]; const Float alpha; @@ -39,13 +40,14 @@ namespace quda { for (int dir = 0; dir < 4; ++dir) { border[dir] = in.R()[dir]; + E[dir] = in.X()[dir]; X[dir] = in.X()[dir] - border[dir] * 2; } } }; - template - __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const Int *X, const int parity, + template + __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const int parity, const int mu, Staple staple[3]) { using Link = typename get_type::type; @@ -64,16 +66,16 @@ namespace quda { // Get link U_{\nu}(x) - Link U1 = arg.in(nu, linkIndexShift(x, dx, X), parity); + Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); // Get link U_{\mu}(x+\nu) dx[nu]++; - Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[nu]--; // Get link U_{\nu}(x+\mu) dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity); + Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[mu]--; // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) @@ -83,13 +85,13 @@ namespace quda { // Get link U_{\nu}(x-\nu) dx[nu]--; - Link U1 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity); + Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\mu}(x-\nu) - Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\nu}(x-\nu+\mu) dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, X), parity); + Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); // reset dx dx[mu]--; @@ -101,8 +103,8 @@ namespace quda } } - template - __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const Int *X, const int parity, + template + __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const int parity, const int mu, Staple staple[3]) { using Link = typename get_type::type; @@ -129,16 +131,16 @@ namespace quda { // Get link U_{\rho}(x) - Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), parity); + Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity); // Get link U_{\mu}(x+\rho) dx[rho]++; - Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[rho]--; // Get link U_{\rho}(x+\mu) dx[mu]++; - Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), 1 - parity); + Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), 1 - parity); dx[mu]--; // staple += U_{\rho}(x) * U_{\mu}(x+\rho) * U^\dag_{\rho}(x+\mu) @@ -148,13 +150,13 @@ namespace quda { // Get link U_{\rho}(x-\rho) dx[rho]--; - Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), 1 - parity); + Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\mu}(x-\rho) - Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\rho}(x-\rho+\mu) dx[mu]++; - Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, X), parity); + Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity); // reset dx dx[mu]--; @@ -167,8 +169,8 @@ namespace quda } } - template - __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const Int *X, const int parity, + template + __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const int parity, const int mu, Staple staple[3]) { using Link = typename get_type::type; @@ -187,16 +189,16 @@ namespace quda { // Get link U_{\nu}(x) - Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), parity); + Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), parity); // Get link U_{\mu}(x+\nu) dx[nu]++; - Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[nu]--; // Get link U_{\nu}(x+\mu) dx[mu]++; - Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), 1 - parity); + Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[mu]--; // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) @@ -206,13 +208,13 @@ namespace quda { // Get link U_{\nu}(x-\nu) dx[nu]--; - Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), 1 - parity); + Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\mu}(x-\nu) - Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\nu}(x-\nu+\mu) dx[mu]++; - Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, X), parity); + Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), parity); // reset dx dx[mu]--; @@ -235,50 +237,46 @@ namespace quda typedef Matrix, Arg::nColor> Link; // compute spacetime and local coords - int X[4]; - for (int dr = 0; dr < 4; ++dr) X[dr] = arg.X[dr]; int x[4]; - getCoords(x, x_cb, X, parity); - for (int dr = 0; dr < 4; ++dr) { - x[dr] += arg.border[dr]; - X[dr] += 2 * arg.border[dr]; - } + getCoords(x, x_cb, arg.E, parity); +#pragma unroll + for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates int dx[4] = {0, 0, 0, 0}; Link U, Stap[3], TestU, I; // Get link U - U = arg.in(dir, linkIndexShift(x, dx, X), parity); + U = arg.in(dir, linkIndexShift(x, dx, arg.X), parity); setIdentity(&I); if constexpr (Arg::level == 1) { - computeStapleLevel1(arg, x, X, parity, dir, Stap); + computeStapleLevel1(arg, x, parity, dir, Stap); for (int i = 0; i < 3; ++i) { TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.)); polarSu3(TestU, arg.tolerance); - arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U; + arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } else if constexpr (Arg::level == 2) { - computeStapleLevel2(arg, x, X, parity, dir, Stap); + computeStapleLevel2(arg, x, parity, dir, Stap); for (int i = 0; i < 3; ++i) { TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)4.)); polarSu3(TestU, arg.tolerance); - arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, X), parity) = TestU * U; + arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } else if constexpr (Arg::level == 3) { - computeStapleLevel3(arg, x, X, parity, dir, Stap); + computeStapleLevel3(arg, x, parity, dir, Stap); TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.)); polarSu3(TestU, arg.tolerance); - arg.out(dir, linkIndexShift(x, dx, X), parity) = TestU * U; + arg.out(dir, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } }; - template - __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const Int *X, const int parity, + template + __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const int parity, const int mu, Staple staple[2], const int dir_ignore) { using Link = typename get_type::type; @@ -297,16 +295,16 @@ namespace quda { // Get link U_{\nu}(x) - Link U1 = arg.in(nu, linkIndexShift(x, dx, X), parity); + Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); // Get link U_{\mu}(x+\nu) dx[nu]++; - Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[nu]--; // Get link U_{\nu}(x+\mu) dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity); + Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[mu]--; // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) @@ -316,13 +314,13 @@ namespace quda { // Get link U_{\nu}(x-\nu) dx[nu]--; - Link U1 = arg.in(nu, linkIndexShift(x, dx, X), 1 - parity); + Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\mu}(x-\nu) - Link U2 = arg.in(mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\nu}(x-\nu+\mu) dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, X), parity); + Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); // reset dx dx[mu]--; @@ -334,8 +332,8 @@ namespace quda } } - template - __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const Int *X, const int parity, + template + __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const int parity, const int mu, Staple staple[2], int dir_ignore) { using Link = typename get_type::type; @@ -357,16 +355,16 @@ namespace quda { // Get link U_{\nu}(x) - Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), parity); + Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), parity); // Get link U_{\mu}(x+\nu) dx[nu]++; - Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[nu]--; // Get link U_{\nu}(x+\mu) dx[mu]++; - Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), 1 - parity); + Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity); dx[mu]--; // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) @@ -376,13 +374,13 @@ namespace quda { // Get link U_{\nu}(x-\nu) dx[nu]--; - Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), 1 - parity); + Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\mu}(x-\nu) - Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, X), 1 - parity); + Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); // Get link U_{\nu}(x-\nu+\mu) dx[mu]++; - Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, X), parity); + Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), parity); // reset dx dx[mu]--; @@ -406,14 +404,11 @@ namespace quda typedef Matrix, Arg::nColor> Link; // compute spacetime and local coords - int X[4]; - for (int dr = 0; dr < 4; ++dr) X[dr] = arg.X[dr]; int x[4]; - getCoords(x, x_cb, X, parity); - for (int dr = 0; dr < 4; ++dr) { - x[dr] += arg.border[dr]; - X[dr] += 2 * arg.border[dr]; - } + getCoords(x, x_cb, arg.X, parity); +#pragma unroll + for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates + int dir_ = dir; dir = dir + (dir >= arg.dir_ignore); @@ -421,23 +416,23 @@ namespace quda Link U, Stap[2], TestU, I; // Get link U - U = arg.in(dir, linkIndexShift(x, dx, X), parity); + U = arg.in(dir, linkIndexShift(x, dx, arg.E), parity); setIdentity(&I); if constexpr (Arg::level == 1) { - computeStaple3DLevel1(arg, x, X, parity, dir, Stap, arg.dir_ignore); + computeStaple3DLevel1(arg, x, parity, dir, Stap, arg.dir_ignore); for (int i = 0; i < 2; ++i) { TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.)); polarSu3(TestU, arg.tolerance); - arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, X), parity) = TestU * U; + arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } else if constexpr (Arg::level == 2) { - computeStaple3DLevel2(arg, x, X, parity, dir, Stap, arg.dir_ignore); + computeStaple3DLevel2(arg, x, parity, dir, Stap, arg.dir_ignore); TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.)); polarSu3(TestU, arg.tolerance); - arg.out(dir, linkIndexShift(x, dx, X), parity) = TestU * U; + arg.out(dir, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } }; From 6685b14cc5b34aeaac88bc5311d3c75ff193bdc3 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 31 Jan 2024 22:08:19 -0800 Subject: [PATCH 10/22] Partial progress towards HYP cleanup --- include/kernels/gauge_hyp.cuh | 143 +++++++++++++++++++++++----------- 1 file changed, 97 insertions(+), 46 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 8e116dcb65..3ff340d363 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -20,7 +20,7 @@ namespace quda Gauge out; Gauge tmp[4]; - const Gauge in; + Gauge in; int E[4]; // extended grid dimensions int X[4]; // grid dimensions @@ -46,11 +46,98 @@ namespace quda } }; - template - __host__ __device__ inline void computeStapleLevel1(const Arg &arg, const int *x, const int parity, - const int mu, Staple staple[3]) + template + __host__ __device__ inline void computeStaple(Matrix, Arg::nColor> &staple, const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity) { - using Link = typename get_type::type; + using Link = Matrix, Arg::nColor>; + int dx[4] = {0, 0, 0, 0}; + + /* Computes the upper staple : + * mu (B) + * +-------+ + * nu | | + * (A) | |(C) + * X X + */ + { + /* load matrix A*/ + Link a = gauge_nu(nu, linkIndex(x, arg.E), parity); + + /* load matrix B*/ + dx[nu]++; + Link b = gauge_mu(mu, linkIndexShift(x, dx, arg.E), 1-parity); + dx[nu]--; + + /* load matrix C*/ + dx[mu]++; + Link c = gauge_nu(nu, linkIndexShift(x, dx, arg.E), 1-parity); + dx[mu]--; + + staple = a * b * conj(c); + } + + /* Computes the lower staple : + * X X + * nu | | + * (A) | | (C) + * +-------+ + * mu (B) + */ + { + /* load matrix A*/ + dx[nu]--; + Link a = gauge_nu(nu, linkIndexShift(x, dx, arg.E), 1-parity); + + /* load matrix B*/ + Link b = gauge_mu(mu, linkIndexShift(x, dx, arg.E), 1-parity); + + /* load matrix C*/ + dx[mu]++; + Link c = gauge_nu(nu, linkIndexShift(x, dx, arg.E), parity); + dx[mu]--; + dx[nu]++; + + staple = staple + conj(a)*b*c; + } + } + + template + __host__ __device__ inline void computeStaple(Matrix, Arg::nColor> &staple, const Arg &arg, + const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity, int mu, int nu) + { + switch (mu) { + case 0: + switch (nu) { + case 1: computeStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 2: computeStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 3: computeStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + } break; + case 1: + switch (nu) { + case 0: computeStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 2: computeStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 3: computeStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + } break; + case 2: + switch (nu) { + case 0: computeStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 1: computeStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 3: computeStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + } break; + case 3: + switch (nu) { + case 0: computeStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 1: computeStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 2: computeStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + } break; + } + } + + template + __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], int parity, + int mu, Matrix, Arg::nColor> staple[3]) + { + using Link = Matrix, Arg::nColor>; for (int i = 0; i < 3; ++i) staple[i] = Link(); thread_array dx = {}; @@ -64,42 +151,7 @@ namespace quda cnt += 1; - { - // Get link U_{\nu}(x) - Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); - - // Get link U_{\mu}(x+\nu) - dx[nu]++; - Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[nu]--; - - // Get link U_{\nu}(x+\mu) - dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[mu]--; - - // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) - staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); - } - - { - // Get link U_{\nu}(x-\nu) - dx[nu]--; - Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); - // Get link U_{\mu}(x-\nu) - Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); - - // Get link U_{\nu}(x-\nu+\mu) - dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); - - // reset dx - dx[mu]--; - dx[nu]++; - - // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) - staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; - } + computeStaple(staple[cnt], arg, arg.in, arg.in, x, parity, mu, nu); } } @@ -169,12 +221,11 @@ namespace quda } } - template + template __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const int parity, - const int mu, Staple staple[3]) + const int mu, Matrix, Arg::nColor> staple[3]) { - using Link = typename get_type::type; - staple[0] = Link(); + using Link = Matrix, Arg::nColor>; thread_array dx = {}; #pragma unroll @@ -234,7 +285,7 @@ namespace quda __device__ __host__ inline void operator()(int x_cb, int parity, int dir) { using real = typename Arg::Float; - typedef Matrix, Arg::nColor> Link; + using Link = Matrix, Arg::nColor>; // compute spacetime and local coords int x[4]; From 2640ae923a477c117de464e95d11ee81687ca40c Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 1 Feb 2024 08:43:36 -0800 Subject: [PATCH 11/22] Cleanup/bugfixes --- include/kernels/gauge_hyp.cuh | 66 +++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 3ff340d363..fd948e57c7 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace quda @@ -22,8 +23,8 @@ namespace quda Gauge tmp[4]; Gauge in; - int E[4]; // extended grid dimensions - int X[4]; // grid dimensions + int_fastdiv E[4]; // extended grid dimensions + int_fastdiv X[4]; // grid dimensions int border[4]; const Float alpha; const int dir_ignore; @@ -55,9 +56,9 @@ namespace quda /* Computes the upper staple : * mu (B) * +-------+ - * nu | | - * (A) | |(C) - * X X + * nu | | + * (A) | |(C) + * X X */ { /* load matrix A*/ @@ -79,9 +80,9 @@ namespace quda /* Computes the lower staple : * X X * nu | | - * (A) | | (C) - * +-------+ - * mu (B) + * (A) | | (C) + * +-------+ + * mu (B) */ { /* load matrix A*/ @@ -152,14 +153,49 @@ namespace quda cnt += 1; computeStaple(staple[cnt], arg, arg.in, arg.in, x, parity, mu, nu); + + /*{ + // Get link U_{\nu}(x) + Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); + + // Get link U_{\mu}(x+\nu) + dx[nu]++; + Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); + dx[nu]--; + + // Get link U_{\nu}(x+\mu) + dx[mu]++; + Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); + dx[mu]--; + + // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) + staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); + } + { + // Get link U_{\nu}(x-\nu) + dx[nu]--; + Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); + // Get link U_{\mu}(x-\nu) + Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); + + // Get link U_{\nu}(x-\nu+\mu) + dx[mu]++; + Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); + + // reset dx + dx[mu]--; + dx[nu]++; + // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) + staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; + }*/ } } - template - __host__ __device__ inline void computeStapleLevel2(const Arg &arg, const int *x, const int parity, - const int mu, Staple staple[3]) + template + __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], int parity, + int mu, Matrix, Arg::nColor> staple[3]) { - using Link = typename get_type::type; + using Link = Matrix, Arg::nColor>; for (int i = 0; i < 3; ++i) staple[i] = Link(); thread_array dx = {}; @@ -181,6 +217,8 @@ namespace quda const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); + //computeStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, parity, sigma_with_mu, sigma_with_rho); + { // Get link U_{\rho}(x) Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity); @@ -289,7 +327,7 @@ namespace quda // compute spacetime and local coords int x[4]; - getCoords(x, x_cb, arg.E, parity); + getCoords(x, x_cb, arg.X, parity); #pragma unroll for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates @@ -297,7 +335,7 @@ namespace quda Link U, Stap[3], TestU, I; // Get link U - U = arg.in(dir, linkIndexShift(x, dx, arg.X), parity); + U = arg.in(dir, linkIndexShift(x, dx, arg.E), parity); setIdentity(&I); if constexpr (Arg::level == 1) { From 4f2a6254eade5fcedf428c543ddd32c3e381673c Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 1 Feb 2024 09:36:48 -0800 Subject: [PATCH 12/22] Cleaned up/corrected staple abstraction. Still generating a stack frame... --- include/kernels/gauge_hyp.cuh | 274 ++++++++-------------------------- 1 file changed, 64 insertions(+), 210 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index fd948e57c7..e226a29e2f 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -21,7 +21,7 @@ namespace quda Gauge out; Gauge tmp[4]; - Gauge in; + const Gauge in; int_fastdiv E[4]; // extended grid dimensions int_fastdiv X[4]; // grid dimensions @@ -47,8 +47,22 @@ namespace quda } }; + /** + @brief Calculates a staple as part of the HYP calculation, returning the product + + @param[in,out] staple The accumulated staple + @param[in] arg Kernel argument + @param[in] gauge_mu Gauge/tensor field used for the parallel direction + @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction + @param[in] x Full index array + @param[in] parity Parity index + @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields + @tparam The {parallel, perpendicular} pair of directions for coordinate offsets + */ template - __host__ __device__ inline void computeStaple(Matrix, Arg::nColor> &staple, const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity) + __host__ __device__ inline void accumulateStaple(Matrix, Arg::nColor> &staple, const Arg &arg, + const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, + int x[], int parity, int2 tensor_arg) { using Link = Matrix, Arg::nColor>; int dx[4] = {0, 0, 0, 0}; @@ -62,19 +76,19 @@ namespace quda */ { /* load matrix A*/ - Link a = gauge_nu(nu, linkIndex(x, arg.E), parity); + Link a = gauge_nu(tensor_arg.y, linkIndex(x, arg.E), parity); /* load matrix B*/ dx[nu]++; - Link b = gauge_mu(mu, linkIndexShift(x, dx, arg.E), 1-parity); + Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1-parity); dx[nu]--; /* load matrix C*/ dx[mu]++; - Link c = gauge_nu(nu, linkIndexShift(x, dx, arg.E), 1-parity); + Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity); dx[mu]--; - staple = a * b * conj(c); + staple = staple + a * b * conj(c); } /* Computes the lower staple : @@ -87,14 +101,14 @@ namespace quda { /* load matrix A*/ dx[nu]--; - Link a = gauge_nu(nu, linkIndexShift(x, dx, arg.E), 1-parity); + Link a = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity); /* load matrix B*/ - Link b = gauge_mu(mu, linkIndexShift(x, dx, arg.E), 1-parity); + Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1-parity); /* load matrix C*/ dx[mu]++; - Link c = gauge_nu(nu, linkIndexShift(x, dx, arg.E), parity); + Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), parity); dx[mu]--; dx[nu]++; @@ -102,34 +116,50 @@ namespace quda } } + /** + @brief Calculates a staple as part of the HYP calculation, returning the product + + @param[in,out] staple The accumulated staple + @param[in] arg Kernel argument + @param[in] gauge_mu Gauge/tensor field used for the parallel direction + @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction + @param[in] x Full index array + @param[in] parity Parity index + @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields + @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets + */ template - __host__ __device__ inline void computeStaple(Matrix, Arg::nColor> &staple, const Arg &arg, - const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity, int mu, int nu) + __host__ __device__ inline void accumulateStaple(Matrix, Arg::nColor> &staple, const Arg &arg, + const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity, int2 tensor_arg, int2 shifts) { + // for readability + const int mu = shifts.x; + const int nu = shifts.y; + switch (mu) { case 0: switch (nu) { - case 1: computeStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break; - case 2: computeStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break; - case 3: computeStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 1: accumulateStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 2: accumulateStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 3: accumulateStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; } break; case 1: switch (nu) { - case 0: computeStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break; - case 2: computeStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break; - case 3: computeStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 0: accumulateStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 2: accumulateStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 3: accumulateStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; } break; case 2: switch (nu) { - case 0: computeStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break; - case 1: computeStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break; - case 3: computeStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 0: accumulateStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 1: accumulateStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 3: accumulateStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; } break; case 3: switch (nu) { - case 0: computeStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, parity); break; - case 1: computeStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, parity); break; - case 2: computeStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, parity); break; + case 0: accumulateStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 1: accumulateStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 2: accumulateStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; } break; } } @@ -152,42 +182,8 @@ namespace quda cnt += 1; - computeStaple(staple[cnt], arg, arg.in, arg.in, x, parity, mu, nu); - - /*{ - // Get link U_{\nu}(x) - Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); - - // Get link U_{\mu}(x+\nu) - dx[nu]++; - Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[nu]--; + accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, parity, { mu, nu }, { mu, nu }); - // Get link U_{\nu}(x+\mu) - dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[mu]--; - - // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) - staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); - } - { - // Get link U_{\nu}(x-\nu) - dx[nu]--; - Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); - // Get link U_{\mu}(x-\nu) - Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); - - // Get link U_{\nu}(x-\nu+\mu) - dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); - - // reset dx - dx[mu]--; - dx[nu]++; - // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) - staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; - }*/ } } @@ -217,51 +213,14 @@ namespace quda const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); - //computeStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, parity, sigma_with_mu, sigma_with_rho); - - { - // Get link U_{\rho}(x) - Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity); - - // Get link U_{\mu}(x+\rho) - dx[rho]++; - Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[rho]--; - - // Get link U_{\rho}(x+\mu) - dx[mu]++; - Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[mu]--; - - // staple += U_{\rho}(x) * U_{\mu}(x+\rho) * U^\dag_{\rho}(x+\mu) - staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); - } - - { - // Get link U_{\rho}(x-\rho) - dx[rho]--; - Link U1 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), 1 - parity); - // Get link U_{\mu}(x-\rho) - Link U2 = arg.tmp[mu / 2](sigma_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); - - // Get link U_{\rho}(x-\rho+\mu) - dx[mu]++; - Link U3 = arg.tmp[rho / 2](sigma_with_rho, linkIndexShift(x, dx, arg.E), parity); - - // reset dx - dx[mu]--; - dx[rho]++; - - // staple += U^\dag_{\rho}(x-\rho) * U_{\mu}(x-\rho) * U_{\rho}(x-\rho+\mu) - staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; - } + accumulateStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho}); } } } template - __host__ __device__ inline void computeStapleLevel3(const Arg &arg, const int *x, const int parity, - const int mu, Matrix, Arg::nColor> staple[3]) + __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], int parity, + int mu, Matrix, Arg::nColor> staple[3]) { using Link = Matrix, Arg::nColor>; @@ -276,42 +235,7 @@ namespace quda const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu); const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); - { - // Get link U_{\nu}(x) - Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), parity); - - // Get link U_{\mu}(x+\nu) - dx[nu]++; - Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[nu]--; - - // Get link U_{\nu}(x+\mu) - dx[mu]++; - Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[mu]--; - - // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) - staple[0] = staple[0] + U1 * U2 * conj(U3); - } - - { - // Get link U_{\nu}(x-\nu) - dx[nu]--; - Link U1 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity); - // Get link U_{\mu}(x-\nu) - Link U2 = arg.tmp[mu / 2 + 2](nu_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); - - // Get link U_{\nu}(x-\nu+\mu) - dx[mu]++; - Link U3 = arg.tmp[nu / 2 + 2](mu_with_nu, linkIndexShift(x, dx, arg.E), parity); - - // reset dx - dx[mu]--; - dx[nu]++; - - // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) - staple[0] = staple[0] + conj(U1) * U2 * U3; - } + accumulateStaple(staple[0], arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, parity, {nu_with_mu, mu_with_nu}, {mu, nu}); } } @@ -365,8 +289,8 @@ namespace quda }; template - __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, const int *x, const int parity, - const int mu, Staple staple[2], const int dir_ignore) + __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], int parity, + int mu, Staple staple[2], const int dir_ignore) { using Link = typename get_type::type; for (int i = 0; i < 2; ++i) staple[i] = Link(); @@ -382,48 +306,13 @@ namespace quda cnt += 1; - { - // Get link U_{\nu}(x) - Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); - - // Get link U_{\mu}(x+\nu) - dx[nu]++; - Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[nu]--; - - // Get link U_{\nu}(x+\mu) - dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[mu]--; - - // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) - staple[cnt] = staple[cnt] + U1 * U2 * conj(U3); - } - - { - // Get link U_{\nu}(x-\nu) - dx[nu]--; - Link U1 = arg.in(nu, linkIndexShift(x, dx, arg.E), 1 - parity); - // Get link U_{\mu}(x-\nu) - Link U2 = arg.in(mu, linkIndexShift(x, dx, arg.E), 1 - parity); - - // Get link U_{\nu}(x-\nu+\mu) - dx[mu]++; - Link U3 = arg.in(nu, linkIndexShift(x, dx, arg.E), parity); - - // reset dx - dx[mu]--; - dx[nu]++; - - // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) - staple[cnt] = staple[cnt] + conj(U1) * U2 * U3; - } + accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, parity, {mu, nu}, {mu, nu}); } } template - __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, const int *x, const int parity, - const int mu, Staple staple[2], int dir_ignore) + __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], int parity, + int mu, Staple staple[2], int dir_ignore) { using Link = typename get_type::type; staple[0] = Link(); @@ -442,42 +331,7 @@ namespace quda const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore); const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore); - { - // Get link U_{\nu}(x) - Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), parity); - - // Get link U_{\mu}(x+\nu) - dx[nu]++; - Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[nu]--; - - // Get link U_{\nu}(x+\mu) - dx[mu]++; - Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity); - dx[mu]--; - - // staple += U_{\nu}(x) * U_{\mu}(x+\nu) * U^\dag_{\nu}(x+\mu) - staple[0] = staple[0] + U1 * U2 * conj(U3); - } - - { - // Get link U_{\nu}(x-\nu) - dx[nu]--; - Link U1 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), 1 - parity); - // Get link U_{\mu}(x-\nu) - Link U2 = arg.tmp[0](rho_with_mu, linkIndexShift(x, dx, arg.E), 1 - parity); - - // Get link U_{\nu}(x-\nu+\mu) - dx[mu]++; - Link U3 = arg.tmp[0](rho_with_nu, linkIndexShift(x, dx, arg.E), parity); - - // reset dx - dx[mu]--; - dx[nu]++; - - // staple += U^\dag_{\nu}(x-\nu) * U_{\mu}(x-\nu) * U_{\nu}(x-\nu+\mu) - staple[0] = staple[0] + conj(U1) * U2 * U3; - } + accumulateStaple(staple[0], arg, arg.tmp[0], arg.tmp[0], x, parity, {rho_with_mu, rho_with_nu}, {mu, nu}); } } } From 0fd5776c63d6d479711218c5f1e43874683f8999 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 1 Feb 2024 10:49:19 -0800 Subject: [PATCH 13/22] Cleanup before getting rid of templating over staples --- include/kernels/gauge_hyp.cuh | 114 +++++++++++++++++----------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index e226a29e2f..c07f75eee3 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -55,17 +55,17 @@ namespace quda @param[in] gauge_mu Gauge/tensor field used for the parallel direction @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction @param[in] x Full index array + @param[in,out] dx Full index offset array; leaves the function with the same values it entered with @param[in] parity Parity index @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields @tparam The {parallel, perpendicular} pair of directions for coordinate offsets */ - template + template __host__ __device__ inline void accumulateStaple(Matrix, Arg::nColor> &staple, const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, - int x[], int parity, int2 tensor_arg) + int x[], Coord &dx, int parity, int2 tensor_arg) { using Link = Matrix, Arg::nColor>; - int dx[4] = {0, 0, 0, 0}; /* Computes the upper staple : * mu (B) @@ -124,13 +124,15 @@ namespace quda @param[in] gauge_mu Gauge/tensor field used for the parallel direction @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction @param[in] x Full index array + @param[in,out] dx Full index offset array; leaves the function with the same values it entered with @param[in] parity Parity index @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets */ - template + template __host__ __device__ inline void accumulateStaple(Matrix, Arg::nColor> &staple, const Arg &arg, - const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], int parity, int2 tensor_arg, int2 shifts) + const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, + int x[], Coord &dx, int parity, int2 tensor_arg, int2 shifts) { // for readability const int mu = shifts.x; @@ -139,39 +141,38 @@ namespace quda switch (mu) { case 0: switch (nu) { - case 1: accumulateStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; - case 2: accumulateStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; - case 3: accumulateStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 1: accumulateStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; + case 2: accumulateStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; + case 3: accumulateStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; } break; case 1: switch (nu) { - case 0: accumulateStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; - case 2: accumulateStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; - case 3: accumulateStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 0: accumulateStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; + case 2: accumulateStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; + case 3: accumulateStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; } break; case 2: switch (nu) { - case 0: accumulateStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; - case 1: accumulateStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; - case 3: accumulateStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 0: accumulateStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; + case 1: accumulateStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; + case 3: accumulateStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; } break; case 3: switch (nu) { - case 0: accumulateStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; - case 1: accumulateStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; - case 2: accumulateStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, parity, tensor_arg); break; + case 0: accumulateStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; + case 1: accumulateStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; + case 2: accumulateStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; } break; } } - template - __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], int parity, + template + __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], Coord &dx, int parity, int mu, Matrix, Arg::nColor> staple[3]) { using Link = Matrix, Arg::nColor>; for (int i = 0; i < 3; ++i) staple[i] = Link(); - thread_array dx = {}; int cnt = -1; #pragma unroll for (int nu = 0; nu < 4; ++nu) { @@ -182,19 +183,18 @@ namespace quda cnt += 1; - accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, parity, { mu, nu }, { mu, nu }); + accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, dx, parity, { mu, nu }, { mu, nu }); } } - template - __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], int parity, + template + __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], Coord &dx, int parity, int mu, Matrix, Arg::nColor> staple[3]) { using Link = Matrix, Arg::nColor>; for (int i = 0; i < 3; ++i) staple[i] = Link(); - thread_array dx = {}; int cnt = -1; #pragma unroll for (int nu = 0; nu < 4; nu++) { @@ -213,18 +213,17 @@ namespace quda const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); - accumulateStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho}); + accumulateStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho}); } } } - template - __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], int parity, + template + __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], Coord &dx, int parity, int mu, Matrix, Arg::nColor> staple[3]) { using Link = Matrix, Arg::nColor>; - thread_array dx = {}; #pragma unroll for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and @@ -235,7 +234,7 @@ namespace quda const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu); const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); - accumulateStaple(staple[0], arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, parity, {nu_with_mu, mu_with_nu}, {mu, nu}); + accumulateStaple(staple[0], arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, {mu, nu}); } } @@ -255,7 +254,8 @@ namespace quda #pragma unroll for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates - int dx[4] = {0, 0, 0, 0}; + thread_array dx = {0, 0, 0, 0}; + Link U, Stap[3], TestU, I; // Get link U @@ -263,23 +263,25 @@ namespace quda setIdentity(&I); if constexpr (Arg::level == 1) { - computeStapleLevel1(arg, x, parity, dir, Stap); + computeStapleLevel1(arg, x, dx, parity, dir, Stap); +#pragma unroll for (int i = 0; i < 3; ++i) { TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.)); polarSu3(TestU, arg.tolerance); arg.tmp[dir / 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } else if constexpr (Arg::level == 2) { - computeStapleLevel2(arg, x, parity, dir, Stap); + computeStapleLevel2(arg, x, dx, parity, dir, Stap); +#pragma unroll for (int i = 0; i < 3; ++i) { TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)4.)); polarSu3(TestU, arg.tolerance); arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } else if constexpr (Arg::level == 3) { - computeStapleLevel3(arg, x, parity, dir, Stap); + computeStapleLevel3(arg, x, dx, parity, dir, Stap); TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.)); polarSu3(TestU, arg.tolerance); @@ -288,36 +290,33 @@ namespace quda } }; - template - __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], int parity, - int mu, Staple staple[2], const int dir_ignore) + template + __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], Coord &dx, int parity, + int mu, Matrix, Arg::nColor> staple[2], const int dir_ignore) { - using Link = typename get_type::type; + using Link = Matrix, Arg::nColor>; for (int i = 0; i < 2; ++i) staple[i] = Link(); - thread_array dx = {}; - int cnt = -1; + int cnt = 0; #pragma unroll for (int nu = 0; nu < 4; ++nu) { - // Identify directions orthogonal to the link and - // ignore the dir_ignore direction (usually the temporal dim - // when used with STOUT or APE for measurement smearing) - if (nu == mu || nu == dir_ignore) continue; - - cnt += 1; - - accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, parity, {mu, nu}, {mu, nu}); + if (nu != mu && nu != dir_ignore) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim + // when used with STOUT or APE for measurement smearing) + accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu}); + cnt++; + } } } - template - __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], int parity, - int mu, Staple staple[2], int dir_ignore) + template + __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], Coord &dx, int parity, + int mu, Matrix, Arg::nColor> staple[2], int dir_ignore) { - using Link = typename get_type::type; + using Link = Matrix, Arg::nColor>; staple[0] = Link(); - thread_array dx = {}; #pragma unroll for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and @@ -331,7 +330,7 @@ namespace quda const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore); const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore); - accumulateStaple(staple[0], arg, arg.tmp[0], arg.tmp[0], x, parity, {rho_with_mu, rho_with_nu}, {mu, nu}); + accumulateStaple(staple[0], arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu}); } } } @@ -352,10 +351,10 @@ namespace quda #pragma unroll for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates + thread_array dx = {0, 0, 0, 0}; + int dir_ = dir; dir = dir + (dir >= arg.dir_ignore); - - int dx[4] = {0, 0, 0, 0}; Link U, Stap[2], TestU, I; // Get link U @@ -363,15 +362,16 @@ namespace quda setIdentity(&I); if constexpr (Arg::level == 1) { - computeStaple3DLevel1(arg, x, parity, dir, Stap, arg.dir_ignore); + computeStaple3DLevel1(arg, x, dx, parity, dir, Stap, arg.dir_ignore); +#pragma unroll for (int i = 0; i < 2; ++i) { TestU = I * (static_cast(1.0) - arg.alpha) + Stap[i] * conj(U) * (arg.alpha / ((real)2.)); polarSu3(TestU, arg.tolerance); arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } else if constexpr (Arg::level == 2) { - computeStaple3DLevel2(arg, x, parity, dir, Stap, arg.dir_ignore); + computeStaple3DLevel2(arg, x, dx, parity, dir, Stap, arg.dir_ignore); TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.)); polarSu3(TestU, arg.tolerance); From 5f15d84b41c7b5c910039d5fd938f567e9d1bd82 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 1 Feb 2024 11:54:04 -0800 Subject: [PATCH 14/22] Removed as much stack frame as possible, remaining frame is due to register count getting maxed out --- include/kernels/gauge_hyp.cuh | 187 ++++++++++++++-------------------- 1 file changed, 79 insertions(+), 108 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index c07f75eee3..9c18cabd59 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -50,7 +50,6 @@ namespace quda /** @brief Calculates a staple as part of the HYP calculation, returning the product - @param[in,out] staple The accumulated staple @param[in] arg Kernel argument @param[in] gauge_mu Gauge/tensor field used for the parallel direction @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction @@ -58,14 +57,20 @@ namespace quda @param[in,out] dx Full index offset array; leaves the function with the same values it entered with @param[in] parity Parity index @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields - @tparam The {parallel, perpendicular} pair of directions for coordinate offsets + @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets + @return The computed staple */ - template - __host__ __device__ inline void accumulateStaple(Matrix, Arg::nColor> &staple, const Arg &arg, - const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, - int x[], Coord &dx, int parity, int2 tensor_arg) + template + __host__ __device__ inline Matrix, Arg::nColor> accumulateStaple(const Arg &arg, + const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, + int x[], thread_array &dx, int parity, int2 tensor_arg, int2 shifts) { using Link = Matrix, Arg::nColor>; + Link staple; + + // for readability + const int mu = shifts.x; + const int nu = shifts.y; /* Computes the upper staple : * mu (B) @@ -88,7 +93,7 @@ namespace quda Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity); dx[mu]--; - staple = staple + a * b * conj(c); + staple = a * b * conj(c); } /* Computes the lower staple : @@ -114,113 +119,74 @@ namespace quda staple = staple + conj(a)*b*c; } - } - - /** - @brief Calculates a staple as part of the HYP calculation, returning the product - - @param[in,out] staple The accumulated staple - @param[in] arg Kernel argument - @param[in] gauge_mu Gauge/tensor field used for the parallel direction - @param[in] gauge_nu Gauge/tensor field used for the perpendicular direction - @param[in] x Full index array - @param[in,out] dx Full index offset array; leaves the function with the same values it entered with - @param[in] parity Parity index - @param[in] tensor_arg The {parallel, perpendicular} indices into the gauge/tensor fields - @param[in] shifts The {parallel, perpendicular} pair of directions for coordinate offsets - */ - template - __host__ __device__ inline void accumulateStaple(Matrix, Arg::nColor> &staple, const Arg &arg, - const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, - int x[], Coord &dx, int parity, int2 tensor_arg, int2 shifts) - { - // for readability - const int mu = shifts.x; - const int nu = shifts.y; - switch (mu) { - case 0: - switch (nu) { - case 1: accumulateStaple<0,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - case 2: accumulateStaple<0,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - case 3: accumulateStaple<0,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - } break; - case 1: - switch (nu) { - case 0: accumulateStaple<1,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - case 2: accumulateStaple<1,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - case 3: accumulateStaple<1,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - } break; - case 2: - switch (nu) { - case 0: accumulateStaple<2,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - case 1: accumulateStaple<2,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - case 3: accumulateStaple<2,3>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - } break; - case 3: - switch (nu) { - case 0: accumulateStaple<3,0>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - case 1: accumulateStaple<3,1>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - case 2: accumulateStaple<3,2>(staple, arg, gauge_mu, gauge_nu, x, dx, parity, tensor_arg); break; - } break; - } + return staple; } - template - __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], Coord &dx, int parity, + template + __host__ __device__ inline void computeStapleLevel1(const Arg &arg, int x[], thread_array &dx, int parity, int mu, Matrix, Arg::nColor> staple[3]) { using Link = Matrix, Arg::nColor>; for (int i = 0; i < 3; ++i) staple[i] = Link(); - int cnt = -1; + int cnt = 0; #pragma unroll for (int nu = 0; nu < 4; ++nu) { // Identify directions orthogonal to the link and // ignore the dir_ignore direction (usually the temporal dim // when used with STOUT or APE for measurement smearing) - if (nu == mu) continue; - - cnt += 1; - - accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, dx, parity, { mu, nu }, { mu, nu }); - + if (nu != mu) { + Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, { mu, nu }, { mu, nu }); + switch (cnt) { + case 0: staple[0] = staple[0] + accum; break; + case 1: staple[1] = staple[1] + accum; break; + case 2: staple[2] = staple[2] + accum; break; + } + cnt++; + } } } - template - __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], Coord &dx, int parity, + template + __host__ __device__ inline void computeStapleLevel2(const Arg &arg, int x[], thread_array &dx, int parity, int mu, Matrix, Arg::nColor> staple[3]) { using Link = Matrix, Arg::nColor>; for (int i = 0; i < 3; ++i) staple[i] = Link(); - int cnt = -1; + int cnt = 0; #pragma unroll for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and // ignore the dir_ignore direction (usually the temporal dim // when used with STOUT or APE for measurement smearing) - if (nu == mu) continue; - - cnt += 1; - - for (int rho = 0; rho < 4; ++rho) { - if (rho == mu || rho == nu) continue; - int sigma = 0; - while (sigma == mu || sigma == nu || sigma == rho) sigma += 1; - - const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); - const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); + if (nu != mu) { +#pragma unroll + for (int rho = 0; rho < 4; ++rho) { + if (rho == mu || rho == nu) continue; + int sigma = 0; + while (sigma == mu || sigma == nu || sigma == rho) sigma += 1; + + const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); + const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); + + Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho}); + switch (cnt) { + case 0: staple[0] = staple[0] + accum; break; + case 1: staple[1] = staple[1] + accum; break; + case 2: staple[2] = staple[2] + accum; break; + } + } - accumulateStaple(staple[cnt], arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho}); + cnt++; } } } - template - __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], Coord &dx, int parity, - int mu, Matrix, Arg::nColor> staple[3]) + template + __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], thread_array &dx, int parity, + int mu, Matrix, Arg::nColor> &staple) { using Link = Matrix, Arg::nColor>; @@ -229,12 +195,12 @@ namespace quda // Identify directions orthogonal to the link and // ignore the dir_ignore direction (usually the temporal dim // when used with STOUT or APE for measurement smearing) - if (nu == mu) continue; + if (nu != mu) { + const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu); + const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); - const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu); - const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); - - accumulateStaple(staple[0], arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, {mu, nu}); + staple = staple + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, {mu, nu}); + } } } @@ -281,7 +247,7 @@ namespace quda arg.tmp[dir / 2 + 2](dir % 2 * 3 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } else if constexpr (Arg::level == 3) { - computeStapleLevel3(arg, x, dx, parity, dir, Stap); + computeStapleLevel3(arg, x, dx, parity, dir, Stap[0]); TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)6.)); polarSu3(TestU, arg.tolerance); @@ -290,8 +256,8 @@ namespace quda } }; - template - __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], Coord &dx, int parity, + template + __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], thread_array &dx, int parity, int mu, Matrix, Arg::nColor> staple[2], const int dir_ignore) { using Link = Matrix, Arg::nColor>; @@ -300,37 +266,42 @@ namespace quda int cnt = 0; #pragma unroll for (int nu = 0; nu < 4; ++nu) { + // Identify directions orthogonal to the link and + // ignore the dir_ignore direction (usually the temporal dim + // when used with STOUT or APE for measurement smearing) if (nu != mu && nu != dir_ignore) { - // Identify directions orthogonal to the link and - // ignore the dir_ignore direction (usually the temporal dim - // when used with STOUT or APE for measurement smearing) - accumulateStaple(staple[cnt], arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu}); + Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu}); + switch (cnt) { + case 0: staple[0] = staple[0] + accum; break; + case 1: staple[1] = staple[1] + accum; break; + } cnt++; } } } - template - __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], Coord &dx, int parity, - int mu, Matrix, Arg::nColor> staple[2], int dir_ignore) + template + __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], thread_array &dx, int parity, + int mu, Matrix, Arg::nColor> &staple, int dir_ignore) { using Link = Matrix, Arg::nColor>; - staple[0] = Link(); + staple = Link(); #pragma unroll for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and // ignore the dir_ignore direction (usually the temporal dim // when used with STOUT or APE for measurement smearing) - if (nu == mu || nu == dir_ignore) continue; - - for (int rho = 0; rho < 4; ++rho) { - if (rho == mu || rho == nu || rho == dir_ignore) continue; - - const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore); - const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore); + if (nu != mu && nu != dir_ignore) { +#pragma unroll + for (int rho = 0; rho < 4; ++rho) { + if (rho != mu && rho != nu && rho != dir_ignore) { + const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore); + const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore); - accumulateStaple(staple[0], arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu}); + staple = staple + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu}); + } + } } } } @@ -371,7 +342,7 @@ namespace quda arg.tmp[0](dir_ * 2 + i, linkIndexShift(x, dx, arg.E), parity) = TestU * U; } } else if constexpr (Arg::level == 2) { - computeStaple3DLevel2(arg, x, dx, parity, dir, Stap, arg.dir_ignore); + computeStaple3DLevel2(arg, x, dx, parity, dir, Stap[0], arg.dir_ignore); TestU = I * (static_cast(1.0) - arg.alpha) + Stap[0] * conj(U) * (arg.alpha / ((real)4.)); polarSu3(TestU, arg.tolerance); From a3fae46a8413d615ded224e8f899db2f4012c366 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 1 Feb 2024 12:53:34 -0800 Subject: [PATCH 15/22] Updated timers --- lib/gauge_hyp.cu | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index 2090d19dd5..aea4723309 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -128,20 +128,40 @@ namespace quda if (dir_ignore == 4) { copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); + + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha3, 1, dir_ignore); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false); tmp[1]->exchangeExtendedGhost(tmp[1]->R(), false); + + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha2, 2, dir_ignore); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + tmp[2]->exchangeExtendedGhost(tmp[2]->R(), false); tmp[3]->exchangeExtendedGhost(tmp[3]->R(), false); + + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha1, 3, dir_ignore); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + out.exchangeExtendedGhost(out.R(), false); } else { copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); + + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha3, 1, dir_ignore); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false); + + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha2, 2, dir_ignore); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + out.exchangeExtendedGhost(out.R(), false); } From d4a24e5422969d9406a9a6ebea1bd4b7c0df88e5 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 1 Feb 2024 14:00:19 -0800 Subject: [PATCH 16/22] clang-format --- include/gauge_tools.h | 3 +-- include/kernels/gauge_hyp.cuh | 42 ++++++++++++++++++++--------------- lib/gauge_hyp.cu | 12 +++++----- lib/interface_quda.cpp | 3 +-- 4 files changed, 31 insertions(+), 29 deletions(-) diff --git a/include/gauge_tools.h b/include/gauge_tools.h index 4c782d0ddc..f6bc3f22b9 100644 --- a/include/gauge_tools.h +++ b/include/gauge_tools.h @@ -129,8 +129,7 @@ namespace quda @param[in] alpha3 smearing parameter @param[in] dir_ignore ignored direction */ - void HYPStep(GaugeField &dataDs, GaugeField &dataOr, double alpha1, double alpha2, - double alpha3, int dir_ignore); + void HYPStep(GaugeField &dataDs, GaugeField &dataOr, double alpha1, double alpha2, double alpha3, int dir_ignore); /** @brief Apply Wilson Flow steps W1, W2, Vt to the gauge field. diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 9c18cabd59..217958c935 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -30,7 +30,7 @@ namespace quda const int dir_ignore; const Float tolerance; - GaugeHYPArg(GaugeField &out, GaugeField* tmp[4], const GaugeField &in, double alpha, int dir_ignore) : + GaugeHYPArg(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int dir_ignore) : kernel_param(dim3(in.LocalVolumeCB(), 2, hypDim)), out(out), tmp {*tmp[0], *tmp[1], *tmp[2], *tmp[3]}, @@ -61,9 +61,9 @@ namespace quda @return The computed staple */ template - __host__ __device__ inline Matrix, Arg::nColor> accumulateStaple(const Arg &arg, - const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, - int x[], thread_array &dx, int parity, int2 tensor_arg, int2 shifts) + __host__ __device__ inline Matrix, Arg::nColor> + accumulateStaple(const Arg &arg, const typename Arg::Gauge &gauge_mu, const typename Arg::Gauge &gauge_nu, int x[], + thread_array &dx, int parity, int2 tensor_arg, int2 shifts) { using Link = Matrix, Arg::nColor>; Link staple; @@ -85,12 +85,12 @@ namespace quda /* load matrix B*/ dx[nu]++; - Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1-parity); + Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1 - parity); dx[nu]--; /* load matrix C*/ dx[mu]++; - Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity); + Link c = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1 - parity); dx[mu]--; staple = a * b * conj(c); @@ -106,10 +106,10 @@ namespace quda { /* load matrix A*/ dx[nu]--; - Link a = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1-parity); + Link a = gauge_nu(tensor_arg.y, linkIndexShift(x, dx, arg.E), 1 - parity); /* load matrix B*/ - Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1-parity); + Link b = gauge_mu(tensor_arg.x, linkIndexShift(x, dx, arg.E), 1 - parity); /* load matrix C*/ dx[mu]++; @@ -117,7 +117,7 @@ namespace quda dx[mu]--; dx[nu]++; - staple = staple + conj(a)*b*c; + staple = staple + conj(a) * b * c; } return staple; @@ -137,7 +137,7 @@ namespace quda // ignore the dir_ignore direction (usually the temporal dim // when used with STOUT or APE for measurement smearing) if (nu != mu) { - Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, { mu, nu }, { mu, nu }); + Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu}); switch (cnt) { case 0: staple[0] = staple[0] + accum; break; case 1: staple[1] = staple[1] + accum; break; @@ -171,7 +171,8 @@ namespace quda const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); - Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, {sigma_with_mu, sigma_with_rho}, {mu, rho}); + Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, + {sigma_with_mu, sigma_with_rho}, {mu, rho}); switch (cnt) { case 0: staple[0] = staple[0] + accum; break; case 1: staple[1] = staple[1] + accum; break; @@ -199,7 +200,9 @@ namespace quda const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu); const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); - staple = staple + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, {mu, nu}); + staple = staple + + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, + {mu, nu}); } } } @@ -218,7 +221,7 @@ namespace quda int x[4]; getCoords(x, x_cb, arg.X, parity); #pragma unroll - for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates + for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates thread_array dx = {0, 0, 0, 0}; @@ -257,8 +260,9 @@ namespace quda }; template - __host__ __device__ inline void computeStaple3DLevel1(const Arg &arg, int x[], thread_array &dx, int parity, - int mu, Matrix, Arg::nColor> staple[2], const int dir_ignore) + __host__ __device__ inline void + computeStaple3DLevel1(const Arg &arg, int x[], thread_array &dx, int parity, int mu, + Matrix, Arg::nColor> staple[2], const int dir_ignore) { using Link = Matrix, Arg::nColor>; for (int i = 0; i < 2; ++i) staple[i] = Link(); @@ -282,7 +286,8 @@ namespace quda template __host__ __device__ inline void computeStaple3DLevel2(const Arg &arg, int x[], thread_array &dx, int parity, - int mu, Matrix, Arg::nColor> &staple, int dir_ignore) + int mu, Matrix, Arg::nColor> &staple, + int dir_ignore) { using Link = Matrix, Arg::nColor>; staple = Link(); @@ -299,7 +304,8 @@ namespace quda const int rho_with_nu = (nu - (nu > dir_ignore)) * 2 + rho - (rho > nu) - (rho > dir_ignore); const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore); - staple = staple + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu}); + staple = staple + + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu}); } } } @@ -320,7 +326,7 @@ namespace quda int x[4]; getCoords(x, x_cb, arg.X, parity); #pragma unroll - for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates + for (int dr = 0; dr < 4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates thread_array dx = {0, 0, 0, 0}; diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index aea4723309..9097a9a3c3 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -10,7 +10,7 @@ namespace quda template class GaugeHYP : TunableKernel3D { GaugeField &out; - GaugeField* tmp[4]; + GaugeField *tmp[4]; const GaugeField ∈ const Float alpha; const int level; @@ -21,7 +21,7 @@ namespace quda public: // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim - GaugeHYP(GaugeField &out, GaugeField* tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) : + GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) : TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), tmp {tmp[0], tmp[1], tmp[2], tmp[3]}, @@ -98,8 +98,7 @@ namespace quda }; // GaugeAPE - void HYPStep(GaugeField &out, GaugeField &in, double alpha1, double alpha2, double alpha3, - int dir_ignore) + void HYPStep(GaugeField &out, GaugeField &in, double alpha1, double alpha2, double alpha3, int dir_ignore) { checkPrecision(out, in); checkReconstruct(out, in); @@ -108,12 +107,12 @@ namespace quda GaugeFieldParam gParam(out); gParam.location = QUDA_CUDA_FIELD_LOCATION; const int smearDim = (dir_ignore >= 0 && dir_ignore <= 3) ? 3 : 4; - //GaugeField tmp[4]; + // GaugeField tmp[4]; gParam.geometry = QUDA_TENSOR_GEOMETRY; GaugeFieldParam gParam2(gParam); gParam2.create = QUDA_REFERENCE_FIELD_CREATE; - GaugeField* tmp[4]; + GaugeField *tmp[4]; if (smearDim == 3) { tmp[0] = new GaugeField(gParam); // aux[1], aux[2] and aux[3] will not be used for smearDim == 3 @@ -166,7 +165,6 @@ namespace quda } for (int i = 0; i < 4; i++) delete tmp[i]; - } } // namespace quda \ No newline at end of file diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 073ddb6f63..b44d4c534b 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -5094,8 +5094,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore); break; case QUDA_GAUGE_SMEAR_HYP: - HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, - smear_param->dir_ignore); + HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore); break; default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); } From 672177670ebb583c9b3f10d79f583291625917a9 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 5 Feb 2024 09:21:45 -0800 Subject: [PATCH 17/22] Clang error: unused "using" --- include/kernels/gauge_hyp.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 9c18cabd59..a092b2ec0e 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -188,8 +188,6 @@ namespace quda __host__ __device__ inline void computeStapleLevel3(const Arg &arg, int x[], thread_array &dx, int parity, int mu, Matrix, Arg::nColor> &staple) { - using Link = Matrix, Arg::nColor>; - #pragma unroll for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and From a2326f87e7680c4679a0c09ba22980ecffd7fb16 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 5 Feb 2024 09:27:26 -0800 Subject: [PATCH 18/22] Removed superfluous getProfile calls --- lib/gauge_hyp.cu | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index aea4723309..0e7e79c4fa 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -125,45 +125,27 @@ namespace quda if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (dir_ignore == 4) { copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); - - getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha3, 1, dir_ignore); - getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); - tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false); tmp[1]->exchangeExtendedGhost(tmp[1]->R(), false); - - getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha2, 2, dir_ignore); - getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); - tmp[2]->exchangeExtendedGhost(tmp[2]->R(), false); tmp[3]->exchangeExtendedGhost(tmp[3]->R(), false); - - getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha1, 3, dir_ignore); - getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); - out.exchangeExtendedGhost(out.R(), false); } else { copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); - - getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha3, 1, dir_ignore); - getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); - tmp[0]->exchangeExtendedGhost(tmp[0]->R(), false); - - getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, tmp, in, alpha2, 2, dir_ignore); - getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); - out.exchangeExtendedGhost(out.R(), false); } + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); for (int i = 0; i < 4; i++) delete tmp[i]; From 0892e264aae823da98f06e12c1e9cf21c333438a Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Tue, 6 Feb 2024 14:45:06 +0800 Subject: [PATCH 19/22] Remove unused param to make compiler happy. --- lib/gauge_hyp.cu | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index 9aa2995b9f..051ea2962b 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -104,25 +104,22 @@ namespace quda checkReconstruct(out, in); checkNative(out, in); + if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } + GaugeFieldParam gParam(out); gParam.location = QUDA_CUDA_FIELD_LOCATION; - const int smearDim = (dir_ignore >= 0 && dir_ignore <= 3) ? 3 : 4; - // GaugeField tmp[4]; gParam.geometry = QUDA_TENSOR_GEOMETRY; - GaugeFieldParam gParam2(gParam); - gParam2.create = QUDA_REFERENCE_FIELD_CREATE; GaugeField *tmp[4]; - if (smearDim == 3) { + if (dir_ignore == 4) { // hypDim == 4 + for (int i = 0; i < 4; ++i) { tmp[i] = new GaugeField(gParam); } + } else { // hypDim == 3 tmp[0] = new GaugeField(gParam); - // aux[1], aux[2] and aux[3] will not be used for smearDim == 3 + // tmp[1], tmp[2] and tmp[3] will not be used for hypDim == 3 gParam.create = QUDA_REFERENCE_FIELD_CREATE; for (int i = 1; i < 4; ++i) { tmp[i] = new GaugeField(gParam); } - } else { - for (int i = 0; i < 4; ++i) { tmp[i] = new GaugeField(gParam); } } - if (dir_ignore < 0 || dir_ignore > 3) { dir_ignore = 4; } getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (dir_ignore == 4) { From 16889318aecced86cbac1f4c467af978784f3a9a Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Tue, 6 Feb 2024 15:12:07 +0800 Subject: [PATCH 20/22] clang-14 + nvcc-11.8 think variables like `sigma_with_mu` unused. They might not happy with implicit conversion to `int2`. --- include/kernels/gauge_hyp.cuh | 8 ++++---- lib/gauge_hyp.cu | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 574a35d3f4..22843fce09 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -172,7 +172,7 @@ namespace quda const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, - {sigma_with_mu, sigma_with_rho}, {mu, rho}); + int2 {sigma_with_mu, sigma_with_rho}, {mu, rho}); switch (cnt) { case 0: staple[0] = staple[0] + accum; break; case 1: staple[1] = staple[1] + accum; break; @@ -199,8 +199,8 @@ namespace quda const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); staple = staple - + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, {nu_with_mu, mu_with_nu}, - {mu, nu}); + + accumulateStaple(arg, arg.tmp[mu / 2 + 2], arg.tmp[nu / 2 + 2], x, dx, parity, + int2 {nu_with_mu, mu_with_nu}, {mu, nu}); } } } @@ -303,7 +303,7 @@ namespace quda const int rho_with_mu = (mu - (mu > dir_ignore)) * 2 + rho - (rho > mu) - (rho > dir_ignore); staple = staple - + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, {rho_with_mu, rho_with_nu}, {mu, nu}); + + accumulateStaple(arg, arg.tmp[0], arg.tmp[0], x, dx, parity, int2 {rho_with_mu, rho_with_nu}, {mu, nu}); } } } diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index 051ea2962b..142459d0cb 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -120,7 +120,6 @@ namespace quda for (int i = 1; i < 4; ++i) { tmp[i] = new GaugeField(gParam); } } - getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (dir_ignore == 4) { copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); From 74f9a236a063f32955d262d0ea67073fdee54629 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Tue, 6 Feb 2024 15:38:10 +0800 Subject: [PATCH 21/22] Update comments. --- include/kernels/gauge_hyp.cuh | 42 +++++++++++++---------------------- include/quda.h | 6 ++--- lib/gauge_ape.cu | 2 +- lib/gauge_hyp.cu | 2 +- lib/gauge_stout.cu | 2 +- 5 files changed, 22 insertions(+), 32 deletions(-) diff --git a/include/kernels/gauge_hyp.cuh b/include/kernels/gauge_hyp.cuh index 22843fce09..c444f86169 100644 --- a/include/kernels/gauge_hyp.cuh +++ b/include/kernels/gauge_hyp.cuh @@ -133,9 +133,6 @@ namespace quda int cnt = 0; #pragma unroll for (int nu = 0; nu < 4; ++nu) { - // Identify directions orthogonal to the link and - // ignore the dir_ignore direction (usually the temporal dim - // when used with STOUT or APE for measurement smearing) if (nu != mu) { Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu}); switch (cnt) { @@ -158,25 +155,23 @@ namespace quda int cnt = 0; #pragma unroll for (int nu = 0; nu < 4; nu++) { - // Identify directions orthogonal to the link and - // ignore the dir_ignore direction (usually the temporal dim - // when used with STOUT or APE for measurement smearing) if (nu != mu) { #pragma unroll for (int rho = 0; rho < 4; ++rho) { - if (rho == mu || rho == nu) continue; - int sigma = 0; - while (sigma == mu || sigma == nu || sigma == rho) sigma += 1; - - const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); - const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); - - Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, - int2 {sigma_with_mu, sigma_with_rho}, {mu, rho}); - switch (cnt) { - case 0: staple[0] = staple[0] + accum; break; - case 1: staple[1] = staple[1] + accum; break; - case 2: staple[2] = staple[2] + accum; break; + if (rho != mu && rho != nu) { + int sigma = 0; + while (sigma == mu || sigma == nu || sigma == rho) sigma += 1; + + const int sigma_with_rho = rho % 2 * 3 + sigma - (sigma > rho); + const int sigma_with_mu = mu % 2 * 3 + sigma - (sigma > mu); + + Link accum = accumulateStaple(arg, arg.tmp[mu / 2], arg.tmp[rho / 2], x, dx, parity, + int2 {sigma_with_mu, sigma_with_rho}, {mu, rho}); + switch (cnt) { + case 0: staple[0] = staple[0] + accum; break; + case 1: staple[1] = staple[1] + accum; break; + case 2: staple[2] = staple[2] + accum; break; + } } } @@ -191,9 +186,6 @@ namespace quda { #pragma unroll for (int nu = 0; nu < 4; nu++) { - // Identify directions orthogonal to the link and - // ignore the dir_ignore direction (usually the temporal dim - // when used with STOUT or APE for measurement smearing) if (nu != mu) { const int mu_with_nu = nu % 2 * 3 + mu - (mu > nu); const int nu_with_mu = mu % 2 * 3 + nu - (nu > mu); @@ -269,8 +261,7 @@ namespace quda #pragma unroll for (int nu = 0; nu < 4; ++nu) { // Identify directions orthogonal to the link and - // ignore the dir_ignore direction (usually the temporal dim - // when used with STOUT or APE for measurement smearing) + // ignore the dir_ignore direction (usually the temporal dim) if (nu != mu && nu != dir_ignore) { Link accum = accumulateStaple(arg, arg.in, arg.in, x, dx, parity, {mu, nu}, {mu, nu}); switch (cnt) { @@ -293,8 +284,7 @@ namespace quda #pragma unroll for (int nu = 0; nu < 4; nu++) { // Identify directions orthogonal to the link and - // ignore the dir_ignore direction (usually the temporal dim - // when used with STOUT or APE for measurement smearing) + // ignore the dir_ignore direction (usually the temporal dim) if (nu != mu && nu != dir_ignore) { #pragma unroll for (int rho = 0; rho < 4; ++rho) { diff --git a/include/quda.h b/include/quda.h index 4469481d86..2a5bffa13a 100644 --- a/include/quda.h +++ b/include/quda.h @@ -846,9 +846,9 @@ extern "C" { Wilson/Symanzik flow */ double alpha; /**< The single coefficient used in APE smearing */ double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ - double alpha1; /**< The first coefficient used in HYP smearing */ - double alpha2; /**< The second coefficient used in HYP smearing */ - double alpha3; /**< The third coefficient used in HYP smearing */ + double alpha1; /**< The coefficient used in HYP smearing step 3 (will not be used in 3D smearing)*/ + double alpha2; /**< The coefficient used in HYP smearing step 2*/ + double alpha3; /**< The coefficient used in HYP smearing step 1*/ unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ QudaGaugeSmearType smear_type; /**< The smearing type to perform */ QudaBoolean restart; /**< Used to restart the smearing from existing gaugeSmeared */ diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu index 8d61066041..2174e3f9b2 100644 --- a/lib/gauge_ape.cu +++ b/lib/gauge_ape.cu @@ -17,7 +17,7 @@ namespace quda { unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: - // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim GaugeAPE(GaugeField &out, const GaugeField &in, double alpha, int dir_ignore) : TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), diff --git a/lib/gauge_hyp.cu b/lib/gauge_hyp.cu index 142459d0cb..4b2c1e3a65 100644 --- a/lib/gauge_hyp.cu +++ b/lib/gauge_hyp.cu @@ -20,7 +20,7 @@ namespace quda unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: - // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim GaugeHYP(GaugeField &out, GaugeField *tmp[4], const GaugeField &in, double alpha, int level, int dir_ignore) : TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index 2cf1c4e634..b3433784ce 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -25,7 +25,7 @@ namespace quda { } public: - // (2,3): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim + // (2,3/4): 2 for parity in the y thread dim, 3 or 4 corresponds to mapping direction to the z thread dim GaugeSTOUT(GaugeField &out, const GaugeField &in, bool improved, double rho, double epsilon, int dir_ignore) : TunableKernel3D(in, 2, (dir_ignore == 4) ? 4 : 3), out(out), From 2b5a7c2f0f0a4e0f22654ae18aeaf064b54e038f Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Wed, 7 Feb 2024 02:50:27 +0800 Subject: [PATCH 22/22] Use `dir_ignore < 0` to represent the default situation. 3D for APE/STOUT and 4D for OVRIMP_STOUT/HYP. --- include/quda.h | 3 ++- lib/check_params.h | 2 +- lib/interface_quda.cpp | 15 +++++++++++---- tests/su3_test.cpp | 7 ++++--- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/quda.h b/include/quda.h index 2a5bffa13a..de88d1d0d2 100644 --- a/include/quda.h +++ b/include/quda.h @@ -853,7 +853,8 @@ extern "C" { QudaGaugeSmearType smear_type; /**< The smearing type to perform */ QudaBoolean restart; /**< Used to restart the smearing from existing gaugeSmeared */ double t0; /**< Starting flow time for Wilson flow */ - int dir_ignore; /**< The direction to be ignored by the smearing algorithm */ + int dir_ignore; /**< The direction to be ignored by the smearing algorithm + A negative value means 3D for APE/STOUT and 4D for OVRIMP_STOUT/HYP */ } QudaGaugeSmearParam; typedef struct QudaBLASParam_s { diff --git a/lib/check_params.h b/lib/check_params.h index fc4d80d315..86800b0e7e 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -1105,7 +1105,7 @@ void printQudaGaugeSmearParam(QudaGaugeSmearParam *param) P(alpha1, 0.0); P(alpha2, 0.0); P(alpha3, 0.0); - P(dir_ignore, 4); + P(dir_ignore, -1); #else P(n_steps, (unsigned int)INVALID_INT); P(meas_interval, (unsigned int)INVALID_INT); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index b44d4c534b..f8c2fcda74 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -5086,15 +5086,22 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable gaugeObservablesQuda(&obs_param[measurement_n]); logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); + // set default dir_ignore = 3 for APE and STOUT for compatibility + int dir_ignore = smear_param->dir_ignore; + if (dir_ignore < 0 + && (smear_param->smear_type == QUDA_GAUGE_SMEAR_APE || smear_param->smear_type == QUDA_GAUGE_SMEAR_STOUT)) { + dir_ignore = 3; + } + for (unsigned int i = 0; i < smear_param->n_steps; i++) { switch (smear_param->smear_type) { - case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha, smear_param->dir_ignore); break; - case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->dir_ignore); break; + case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha, dir_ignore); break; + case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho, dir_ignore); break; case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: - OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, smear_param->dir_ignore); + OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon, dir_ignore); break; case QUDA_GAUGE_SMEAR_HYP: - HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, smear_param->dir_ignore); + HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, dir_ignore); break; default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); } diff --git a/tests/su3_test.cpp b/tests/su3_test.cpp index acd2803b42..265fed1248 100644 --- a/tests/su3_test.cpp +++ b/tests/su3_test.cpp @@ -27,7 +27,7 @@ double gauge_smear_alpha2 = 0.6; double gauge_smear_alpha3 = 0.3; int gauge_smear_steps = 50; QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; -int gauge_smear_dir_ignore = 4; +int gauge_smear_dir_ignore = -1; int measurement_interval = 5; bool su_project = true; @@ -98,8 +98,9 @@ void add_su3_option_group(std::shared_ptr quda_app) opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)"); opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)"); - opgroup->add_option("--su3-smear-dir-ignore", gauge_smear_dir_ignore, - "direction to be ignored by the smearing (default 4)"); + opgroup->add_option( + "--su3-smear-dir-ignore", gauge_smear_dir_ignore, + "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)"); opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)");