diff --git a/include/blas_3d.h b/include/blas_3d.h
new file mode 100644
index 0000000000..2616a8ac03
--- /dev/null
+++ b/include/blas_3d.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <vector>
+#include <color_spinor_field.h>
+
+namespace quda
+{
+
+  namespace blas3d
+  {
+
+    // Local enum for the 3D copy type
+    enum class copyType { COPY_TO_3D, COPY_FROM_3D, SWAP_3D };
+
+    /**
+       @brief Extract / insert / swap a timeslice between a 4-d field and a 3-d field
+       @param[in] slice Which slice
+       @param[in] type Extracting a timeslice (COPY_TO_3D), inserting
+       a timeslice (COPY_FROM_3D), or swapping timeslices (SWAP_3D)
+       @param[in,out] x 3-d field
+       @param[in,out] y 4-d field
+    */
+    void copy(int slice, copyType type, ColorSpinorField &x, ColorSpinorField &y);
+
+    /**
+       @brief Swap the given slice in two fields
+       @param[in] slice The slice we wish to swap in the fields
+       @param[in,out] x Field whose slice we wish to swap
+       @param[in,out] y Field whose slice we wish to swap
+    */
+    void swap(int slice, ColorSpinorField &x, ColorSpinorField &y);
+
+    /**
+       @brief Compute a set of real-valued inner products <x,y>, where each inner
+       product is restricted to a timeslice.
+       @param[out] result Vector of spatial inner products
+       @param[in] x Left vector field
+       @param[in] y Right vector field
+    */
+    void reDotProduct(std::vector<double> &result, const ColorSpinorField &x, const ColorSpinorField &y);
+
+    /**
+       @brief Compute a set of complex-valued inner products <x,y>, where each inner
+       product is restricted to a timeslice.
+       @param[out] result Vector of spatial inner products
+       @param[in] x Left vector field
+       @param[in] y Right vector field
+    */
+    void cDotProduct(std::vector<Complex> &result, const ColorSpinorField &x, const ColorSpinorField &y);
+
+    /**
+       @brief Timeslice real-valued scaling of the field
+       @param[in] a Vector of scale factors (length = local temporal extent)
+       @param[in,out] x Field we wish to scale
+    */
+    void ax(const std::vector<double> &a, ColorSpinorField &x);
+
+    /**
+       @brief Timeslice real-valued axpby computation
+       @param[in] a Vector of scale factors (length = local temporal extent)
+       @param[in] x Input field
+       @param[in] b Vector of scale factors (length = local temporal extent)
+       @param[in,out] y Field we are updating
+    */
+    void axpby(const std::vector<double> &a, const ColorSpinorField &x, const std::vector<double> &b,
+               ColorSpinorField &y);
+
+    /**
+       @brief Timeslice complex-valued axpby computation
+       @param[in] a Vector of scale factors (length = local temporal extent)
+       @param[in] x Input field
+       @param[in] b Vector of scale factors (length = local temporal extent)
+       @param[in,out] y Field we are updating
+    */
+    void caxpby(const std::vector<Complex> &a, const ColorSpinorField &x, const std::vector<Complex> &b,
+               ColorSpinorField &y);
+
+  } // namespace blas3d
+} // namespace quda
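For orientation, here is a minimal usage sketch of the new blas3d interface (illustrative only, not part of the patch): it normalises every timeslice of a field independently. The field `y` and the helper name `normalize_timeslices` are assumptions; everything else follows the declarations above.

    #include <cmath>
    #include <vector>
    #include <blas_3d.h>

    // Normalise each timeslice of y independently; y is assumed to be a
    // valid 4-d quda::ColorSpinorField (construction not shown).
    void normalize_timeslices(quda::ColorSpinorField &y)
    {
      int t_len = y.X(3); // local temporal extent
      std::vector<double> norms(t_len);
      quda::blas3d::reDotProduct(norms, y, y); // norms[t] = <y,y> restricted to slice t
      std::vector<double> a(t_len);
      for (int t = 0; t < t_len; t++) a[t] = 1.0 / std::sqrt(norms[t]);
      quda::blas3d::ax(a, y); // y[t] *= a[t], one scale factor per slice
    }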
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 059329507b..d8980afc75 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -280,6 +280,8 @@ namespace quda
       //! for deflation etc.
       if (is_composite) printfQuda("Number of elements = %d\n", composite_dim);
     }
+
+    void change_dim(int D, int d) { x[D] = d; }
   };
 
   struct DslashConstant;
@@ -1054,6 +1056,38 @@ namespace quda
   */
   void spinorDistanceReweight(ColorSpinorField &src, double alpha0, int t0);
 
+  /**
+     @brief Helper function for determining if the spin of the fields is the same.
+     @param[in] a Input field
+     @param[in] b Input field
+     @return If the spin is unique, return the number of spins
+  */
+  inline int Spin_(const char *func, const char *file, int line, const ColorSpinorField &a, const ColorSpinorField &b)
+  {
+    int nSpin = 0;
+    if (a.Nspin() == b.Nspin())
+      nSpin = a.Nspin();
+    else
+      errorQuda("Spin %d %d do not match (%s:%d in %s())", a.Nspin(), b.Nspin(), file, line, func);
+    return nSpin;
+  }
+
+  /**
+     @brief Helper function for determining if the spin of the fields is the same.
+     @param[in] a Input field
+     @param[in] b Input field
+     @param[in] args List of additional fields to check spin on
+     @return If the spin is unique, return the number of spins
+  */
+  template <typename... Args>
+  inline int Spin_(const char *func, const char *file, int line, const ColorSpinorField &a, const ColorSpinorField &b,
+                   const Args &...args)
+  {
+    return Spin_(func, file, line, a, b) & Spin_(func, file, line, a, args...);
+  }
+
+#define checkSpin(...) Spin_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
   /**
      @brief Helper function for determining if the preconditioning
      type of the fields is the same.
diff --git a/include/comm_key.h b/include/comm_key.h
index 6618068bb0..a9682ddc03 100644
--- a/include/comm_key.h
+++ b/include/comm_key.h
@@ -1,27 +1,33 @@
 #pragma once
 
+#include <array.h>
+
 namespace quda
 {
 
   struct CommKey {
     static constexpr int n_dim = 4;
+    array<int, n_dim> key = {0, 0, 0, 0};
 
-    int array[n_dim] = {0, 0, 0, 0};
+    constexpr int product() { return key[0] * key[1] * key[2] * key[3]; }
 
-    constexpr inline int product() { return array[0] * array[1] * array[2] * array[3]; }
+    constexpr int &operator[](int d) { return key[d]; }
 
-    constexpr inline int &operator[](int d) { return array[d]; }
+    constexpr const int &operator[](int d) const { return key[d]; }
 
-    constexpr inline const int &operator[](int d) const { return array[d]; }
+    constexpr auto data() { return key.data; }
 
-    constexpr inline int *data() { return array; }
+    constexpr auto data() const { return key.data; }
 
-    constexpr inline const int *data() const { return array; }
+    constexpr bool is_valid() const { return (key[0] > 0) && (key[1] > 0) && (key[2] > 0) && (key[3] > 0); }
 
-    constexpr inline bool is_valid() const
+    bool operator==(const CommKey &other) const
     {
-      return (array[0] > 0) && (array[1] > 0) && (array[2] > 0) && (array[3] > 0);
+      bool is_same = true;
+      for (auto i = 0; i < n_dim; i++)
+        if (key[i] != other.key[i]) is_same = false;
+      return is_same;
     }
   };
diff --git a/include/comm_quda.h b/include/comm_quda.h
index 7d3bdfde19..39563c84f9 100644
--- a/include/comm_quda.h
+++ b/include/comm_quda.h
@@ -50,12 +50,26 @@ namespace quda
   int comm_dim(int dim);
 
   /**
-     Return the coording of this process in the dimension dim
+     Return the global number of processes in the dimension dim
+     @param dim Dimension which we are querying
+     @return Number of processes in this dimension
+  */
+  int comm_dim_global(int dim);
+
+  /**
+     Return the coordinate of this process in the dimension dim
     @param dim Dimension which we are querying
     @return Coordinate of this process
   */
   int comm_coord(int dim);
 
+  /**
+     Return the global coordinate of this process in the dimension dim
+     @param dim Dimension which we are querying
+     @return Coordinate of this process
+  */
+  int comm_coord_global(int dim);
+
   /**
    * Declare a message handle for sending `nbytes` to the `rank` with `tag`.
   */
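A quick note on the variadic `Spin_` overload above, since the bitwise AND can look odd at first glance: every pairwise call either aborts via `errorQuda` on a mismatch or returns the one common `Nspin()`, so ANDing the identical return values reproduces that value. A hypothetical three-field check expands as:

    // checkSpin(a, b, c) ==> Spin_(__func__, __FILE__, __LINE__, a, b, c)
    //                    ==> Spin_(a, b) & Spin_(a, c)  // func/file/line elided
    // Both calls return a.Nspin() (or error out), and n & n == n.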
diff --git a/include/dslash_quda.h b/include/dslash_quda.h
index 38efe07e6c..32111e92f9 100644
--- a/include/dslash_quda.h
+++ b/include/dslash_quda.h
@@ -773,9 +773,12 @@ namespace quda
     @param[in] a Scale factor applied to derivative
     @param[in] b Scale factor applied to aux field
     @param[in] x Vector field we accumulate onto to
+    @param[in] parity Destination parity
+    @param[in] comm_override Override for which dimensions are partitioned
+    @param[in] profile The TimeProfile used for profiling the dslash
   */
   void ApplyLaplace(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U,
-                    int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
+                    int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, const int *comm_override,
                     TimeProfile &profile);
 
   /**
diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h
index ee01a51b32..4fcea4b3d3 100644
--- a/include/eigensolve_quda.h
+++ b/include/eigensolve_quda.h
@@ -148,7 +148,7 @@ namespace quda
       @param[in] out Output spinor
       @param[in] in Input spinor
    */
-    double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in);
+    virtual double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in);
 
     /**
        @brief Orthogonalise input vectors r against
@@ -263,22 +263,10 @@ namespace quda
       @brief Compute eigenvalues and their residiua
       @param[in] mat Matrix operator
       @param[in] evecs The eigenvectors
-      @param[in] evals The eigenvalues
+      @param[in,out] evals The eigenvalues
       @param[in] size The number of eigenvalues to compute
    */
-    void computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size);
-
-    /**
-       @brief Compute eigenvalues and their residiua. This variant compute the number of converged eigenvalues.
-       @param[in] mat Matrix operator
-       @param[in] evecs The eigenvectors
-       @param[in] evals The eigenvalues
-    */
-    void computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals)
-    {
-      computeEvals(evecs, evals, n_conv);
-    }
+    virtual void computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size = 0);
 
     /**
        @brief Load and check eigenpairs from file
@@ -461,6 +449,116 @@ namespace quda
     void computeBlockKeptRitz(std::vector<ColorSpinorField> &kSpace);
   };
 
+  /**
+     @brief Thick Restarted Lanczos Method for 3D slices
+  */
+  class TRLM3D : public EigenSolver
+  {
+    bool verbose_rank = false; /** Whether this rank is one that logs */
+
+  public:
+    /**
+       @brief Constructor for Thick Restarted Eigensolver class
+       @param eig_param The eigensolver parameters
+       @param mat The operator to solve
+    */
+    TRLM3D(const DiracMatrix &mat, QudaEigParam *eig_param);
+
+    /**
+       @return Whether the solver is only for Hermitian systems
+    */
+    virtual bool hermitian() override { return true; } /** TRLM3D is only for Hermitian systems */
+
+    // Variable size matrix (for the 3D problem)
+    std::vector<std::vector<double>> ritz_mat_3D;
+
+    // Arrays for 3D residua
+    std::vector<std::vector<double>> residua_3D;
+
+    // Arrays for convergence and bookkeeping, one entry per slice
+    std::vector<bool> converged_3D;
+    std::vector<bool> active_3D;
+    std::vector<int> iter_locked_3D;
+    std::vector<int> iter_keep_3D;
+    std::vector<int> iter_converged_3D;
+    std::vector<int> num_locked_3D;
+    std::vector<int> num_keep_3D;
+    std::vector<int> num_converged_3D;
+
+    // Tridiagonal/Arrow matrices, fixed size (for the 3D problem)
+    std::vector<std::vector<double>> alpha_3D;
+    std::vector<std::vector<double>> beta_3D;
+
+    // The orthogonal direction and size in the 3D problem
+    int ortho_dim;
+    int ortho_dim_size;
+
+    /**
+       @brief Compute eigenpairs
+       @param[in] kSpace Krylov vector space
+       @param[in,out] evals Computed eigenvalues
+    */
+    void operator()(std::vector<ColorSpinorField> &kSpace, std::vector<Complex> &evals) override;
+
+    /**
+       @brief Lanczos step: extends the Krylov space.
+       @param[in] v Vector space
+       @param[in] j Index of vector being computed
+    */
+    void lanczosStep3D(std::vector<ColorSpinorField> &v, int j);
+
+    /**
+       @brief Reorder the Krylov space by eigenvalue
+       @param[in] kSpace the Krylov space
+    */
+    void reorder3D(std::vector<ColorSpinorField> &kSpace);
+
+    /**
+       @brief Get the eigendecomposition from the arrow matrix
+    */
+    void eigensolveFromArrowMat3D();
+
+    /**
+       @brief Rotate the Ritz vectors using the arrow matrix eigendecomposition
+       @param[in] kSpace The current Krylov space
+    */
+    void computeKeptRitz3D(std::vector<ColorSpinorField> &kSpace);
+
+    /**
+       @brief Orthogonalise input vectors r against
+       vector space v using block-BLAS
+       @param[in] v Vector space
+       @param[in] r Vectors to be orthogonalised
+       @param[in] j Use vectors v[0:j]
+    */
+    void blockOrthogonalize3D(std::vector<ColorSpinorField> &v, std::vector<ColorSpinorField> &r, int j);
+
+    /**
+       @brief Check for an initial guess. If none present, populate with random vectors, then
+       orthonormalise
+       @param[in] kSpace The Krylov space vectors
+    */
+    void prepareInitialGuess3D(std::vector<ColorSpinorField> &kSpace, int ortho_dim_size);
+
+    /**
+       @brief Estimate the spectral radius of the operator for the max value of the
+       Chebyshev polynomial
+       @param[in] out Output spinor
+       @param[in] in Input spinor
+    */
+    double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) override;
+
+    /**
+       @brief Compute eigenvalues and their residua
+       @param[in] evecs The eigenvectors
+       @param[in,out] evals The eigenvalues
+       @param[in] size The number of eigenvalues to compute
+    */
+    void computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size = 0) override;
+  };
+
   /**
      @brief Implicitly Restarted Arnoldi Method.
   */
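Since the new solver is driven entirely through QudaEigParam, a minimal configuration sketch may help. This is illustrative only: `t_local` is a placeholder for the local temporal extent, and the usual eigensolver parameters (n_ev, n_kr, n_conv, tol, ...) are set as for the ordinary TRLM, noting the constructor's requirement that n_kr >= n_ev + 6.

    QudaEigParam eig_param = newQudaEigParam();
    eig_param.eig_type = QUDA_EIG_TR_LANCZOS_3D; // enum value added below
    eig_param.spectrum = QUDA_SPECTRUM_SR_EIG;   // TRLM3D accepts only real spectra (LR or SR)
    eig_param.ortho_dim = 3;                     // only t-slicing is currently supported
    eig_param.ortho_dim_size_local = t_local;    // local temporal extent (placeholder)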
diff --git a/include/enum_quda.h b/include/enum_quda.h
index f281d33feb..462ce98cf0 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -138,6 +138,7 @@ typedef enum QudaInverterType_s {
 typedef enum QudaEigType_s {
   QUDA_EIG_TR_LANCZOS,     // Thick restarted lanczos solver
   QUDA_EIG_BLK_TR_LANCZOS, // Block Thick restarted lanczos solver
+  QUDA_EIG_TR_LANCZOS_3D,  // Thick restarted lanczos solver for 3-d systems
   QUDA_EIG_IR_ARNOLDI,     // Implicitly Restarted Arnoldi solver
   QUDA_EIG_BLK_IR_ARNOLDI, // Block Implicitly Restarted Arnoldi solver
   QUDA_EIG_INVALID = QUDA_INVALID_ENUM
@@ -277,8 +278,6 @@ typedef enum QudaVerbosity_s {
   QUDA_INVALID_VERBOSITY = QUDA_INVALID_ENUM
 } QudaVerbosity;
 
-typedef enum QudaTune_s { QUDA_TUNE_NO, QUDA_TUNE_YES, QUDA_TUNE_INVALID = QUDA_INVALID_ENUM } QudaTune;
-
 typedef enum QudaPreserveDirac_s {
   QUDA_PRESERVE_DIRAC_NO,
   QUDA_PRESERVE_DIRAC_YES,
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index 596232a7b3..8874959d7b 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -122,8 +122,9 @@
 #define QudaEigType integer(4)
 #define QUDA_EIG_TR_LANCZOS 0     // Thick Restarted Lanczos Solver
 #define QUDA_EIG_BLK_IR_LANCZOS 1 // Block Thick Restarted Lanczos Solver
-#define QUDA_EIG_IR_ARNOLDI 2 // Implicitly restarted Arnoldi solver
-#define QUDA_EIG_BLK_IR_ARNOLDI 3 // Block Implicitly restarted Arnoldi solver (not yet implemented)
+#define QUDA_EIG_TR_LANCZOS_3D 2 // Thick Restarted Lanczos Solver for 3-d systems
+#define QUDA_EIG_IR_ARNOLDI 3 // Implicitly restarted Arnoldi solver
+#define QUDA_EIG_BLK_IR_ARNOLDI 4 // Block Implicitly restarted Arnoldi solver (not yet implemented)
 #define QUDA_EIG_INVALID QUDA_INVALID_ENUM
 
 #define QudaEigSpectrumType integer(4)
@@ -247,11 +248,6 @@
 #define QUDA_DEBUG_VERBOSE 3
 #define QUDA_INVALID_VERBOSITY
QUDA_INVALID_ENUM -#define QudaTune integer(4) -#define QUDA_TUNE_NO 0 -#define QUDA_TUNE_YES 1 -#define QUDA_TUNE_INVALID QUDA_INVALID_ENUM - #define QudaPreserveDirac integer(4) #define QUDA_PRESERVE_DIRAC_NO 0 #define QUDA_PRESERVE_DIRAC_YES 1 diff --git a/include/index_helper.cuh b/include/index_helper.cuh index 990ff949e7..cf1faf72b0 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1106,4 +1106,26 @@ namespace quda { return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1; } + /** + @brief Compute the flattened 4-d index from a separate T and + flattened 3-d XYZ index. + @param[in] t Temporal index + @param[in] xyz Flattened 3-d index + @param[in] X Lattice dimensions + @return The flattened 4-d index + */ + template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) + { + int x[4]; +#pragma unroll + for (int d = 0; d < 4; d++) { + if (d != reduction_dim) { + x[d] = xyz % X[d]; + xyz = xyz / X[d]; + } + } + x[reduction_dim] = t; + return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); + } + } // namespace quda diff --git a/include/kernels/blas_3d.cuh b/include/kernels/blas_3d.cuh new file mode 100644 index 0000000000..e7426e89af --- /dev/null +++ b/include/kernels/blas_3d.cuh @@ -0,0 +1,331 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + struct baseArg : kernel_param<> { + int_fastdiv X[4]; // grid dimensions + + baseArg(dim3 threads, const ColorSpinorField &x) : kernel_param(threads) + { + for (int i = 0; i < 4; i++) { X[i] = x.X()[i]; } + if (x.SiteSubset() == 1) X[0] = 2 * X[0]; + } + }; + + template struct copy3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F y; + F x; + const int slice; + + copy3dArg(ColorSpinorField &y, ColorSpinorField &x, int slice) : + baseArg(dim3(y.VolumeCB(), y.SiteSubset(), 1), y), y(y), x(x), slice(slice) + { + } + }; + + template struct copyTo3d { + const Arg &arg; + constexpr copyTo3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + // Isolate the slice of the 4D array + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + + // Get 4D data + Vector y = arg.y(x_cb, parity); + // Get 3D location + int xyz = ((idx[2] * arg.X[1] + idx[1]) * arg.X[0] + idx[0]); + + // Write to 3D + arg.x(xyz / 2, xyz % 2) = y; + } + } + }; + + template struct copyFrom3d { + const Arg &arg; + constexpr copyFrom3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + int idx[4] = {}; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + + // Get 3D location + int xyz = ((idx[2] * arg.X[1] + idx[1]) * arg.X[0] + idx[0]); + Vector x = arg.x(xyz / 2, xyz % 2); + // Write to 4D + arg.y(x_cb, parity) = x; + } + } + }; + + template struct swap3d { + const Arg &arg; + constexpr swap3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void 
operator()(int x_cb, int parity) + { + int idx[4] = {}; + getCoords(idx, x_cb, arg.X, parity); + + if (idx[3] == arg.slice) { + using Vector = ColorSpinor; + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + arg.x(x_cb, parity) = y; + arg.y(x_cb, parity) = x; + } + } + }; + + template struct axpby3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + static constexpr bool disable_ghost = true; + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + static constexpr int MAX_ORTHO_DIM = 256; + real a[MAX_ORTHO_DIM]; + const F x; + real b[MAX_ORTHO_DIM]; + F y; + + axpby3dArg(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) : + baseArg(dim3(x.VolumeCB(), x.SiteSubset(), 1), x), x(x), y(y) + { + if (x.X(3) > MAX_ORTHO_DIM) errorQuda("Orthogonal dimension %d exceeds maximum %d", x.X(3), MAX_ORTHO_DIM); + for (auto i = 0u; i < a.size(); i++) this->a[i] = a[i]; + for (auto i = 0u; i < b.size(); i++) this->b[i] = b[i]; + } + }; + + template struct axpby3d { + const Arg &arg; + constexpr axpby3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + + // Get the coeffs for this dim slice + real a = arg.a[idx[3]]; + real b = arg.b[idx[3]]; + + // Compute the axpby + Vector out = a * x + b * y; + + // Write to y + arg.y(x_cb, parity) = out; + } + }; + + template struct caxpby3dArg : baseArg { + using real = typename mapper::type; + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + static constexpr bool disable_ghost = true; + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + static constexpr int MAX_ORTHO_DIM = 256; + complex a[MAX_ORTHO_DIM]; + const F x; + complex b[MAX_ORTHO_DIM]; + F y; + + caxpby3dArg(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) : + baseArg(dim3(x.VolumeCB(), x.SiteSubset(), 1), x), x(x), y(y) + { + if (x.X(3) > MAX_ORTHO_DIM) errorQuda("Orthogonal dimension %d exceeds maximum %d", x.X(3), MAX_ORTHO_DIM); + for (auto i = 0u; i < a.size(); i++) this->a[i] = a[i]; + for (auto i = 0u; i < b.size(); i++) this->b[i] = b[i]; + } + }; + + template struct caxpby3d { + const Arg &arg; + constexpr caxpby3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline void operator()(int x_cb, int parity) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + int idx[4]; + getCoords(idx, x_cb, arg.X, parity); + + Vector x = arg.x(x_cb, parity); + Vector y = arg.y(x_cb, parity); + + // Get the coeffs for this dim slice + complex a = arg.a[idx[3]]; + complex b = arg.b[idx[3]]; + + // Compute the caxpby + Vector out = a * x + b * y; + + // Write to y + arg.y(x_cb, parity) = out; + } + }; + + template struct reDotProduct3dArg : public ReduceArg { + using 
real = typename mapper::type; + static constexpr int reduction_dim = 3; // DMH Template this + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F x; + F y; + int_fastdiv Xh[4]; // checkerboard grid dimensions + + reDotProduct3dArg(const ColorSpinorField &x, const ColorSpinorField &y) : + ReduceArg(dim3(x.VolumeCB() / x.X()[reduction_dim], x.SiteSubset(), x.X()[reduction_dim]), + x.X()[reduction_dim]), + x(x), + y(y) + { + for (int i = 0; i < 4; i++) Xh[i] = x.SiteSubset() == 2 && i == 0 ? x.X()[i] / 2 : x.X()[i]; + } + + __device__ __host__ double init() const { return double(); } + }; + + template struct reDotProduct3d : plus { + using reduce_t = double; + using plus::operator(); + static constexpr int reduce_block_dim = 2; + const Arg &arg; + constexpr reDotProduct3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int parity, int t) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + // Collect vector data + int idx_cb = idx_from_t_xyz<3>(t, xyz, arg.Xh); + + Vector x = arg.x(idx_cb, parity); + Vector y = arg.y(idx_cb, parity); + + // Get the inner product + reduce_t sum = innerProduct(x, y).real(); + + // Apply reduction to t bucket + return plus::operator()(sum, result); + } + }; + + template struct cDotProduct3dArg : public ReduceArg> { + using real = typename mapper::type; + static constexpr int reduction_dim = 3; // DMH Template this + static constexpr int nColor = nColor_; + static constexpr int nSpin = 1; + static constexpr bool spin_project = false; + static constexpr bool spinor_direct_load = false; // false means texture load + + // Create a typename F for the ColorSpinorFields + typedef typename colorspinor_mapper::type F; + + F x; + F y; + int_fastdiv X[4]; // grid dimensions + int_fastdiv Xh[4]; // checkerboard grid dimensions + + cDotProduct3dArg(const ColorSpinorField &x, const ColorSpinorField &y) : + ReduceArg>(dim3(x.VolumeCB() / x.X()[reduction_dim], x.SiteSubset(), x.X()[reduction_dim]), + x.X()[reduction_dim]), + x(x), + y(y) + { + for (int i = 0; i < 4; i++) Xh[i] = x.SiteSubset() == 2 && i == 0 ? 
x.X()[i] / 2 : x.X()[i]; + } + + __device__ __host__ reduce_t init() const { return {0.0, 0.0}; } + }; + + template struct cDotProduct3d : plus> { + using reduce_t = array; + using plus::operator(); + static constexpr int reduce_block_dim = 2; + + const Arg &arg; + constexpr cDotProduct3d(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ inline reduce_t operator()(reduce_t &result, int xyz, int parity, int t) + { + using real = typename Arg::real; + using Vector = ColorSpinor; + + // Collect vector data + int idx_cb = idx_from_t_xyz<3>(t, xyz, arg.Xh); + + Vector x = arg.x(idx_cb, parity); + Vector y = arg.y(idx_cb, parity); + + // Get the inner product + complex res = innerProduct(x, y); + + // Apply reduction to temporal bucket + return plus::operator()({res.real(), res.imag()}, result); + } + }; +} // namespace quda diff --git a/include/kernels/contraction.cuh b/include/kernels/contraction.cuh index 96079ed041..815da471d2 100644 --- a/include/kernels/contraction.cuh +++ b/include/kernels/contraction.cuh @@ -32,20 +32,6 @@ namespace quda return ((sink[3] * X[2] + sink[2]) * X[1] + sink[1]) * X[0] + sink[0]; } - template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) - { - int x[4]; -#pragma unroll - for (int d = 0; d < 4; d++) { - if (d != reduction_dim) { - x[d] = xyz % X[d]; - xyz /= X[d]; - } - } - x[reduction_dim] = t; - return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); - } - template struct ContractionSummedArg : public ReduceArg { using reduce_t = contract_t; diff --git a/include/kernels/copy_field_offset.cuh b/include/kernels/copy_field_offset.cuh index ea4a603bfd..5f614b2e49 100644 --- a/include/kernels/copy_field_offset.cuh +++ b/include/kernels/copy_field_offset.cuh @@ -142,12 +142,12 @@ namespace quda if (arg.mode == QudaOffsetCopyMode::COLLECT) { // we are collecting so x_cb is the index for the input. idx_in = x_cb; - getCoordsExtended(coordinate, x_cb, arg.dim_in, parity, arg.offset.array); + getCoordsExtended(coordinate, x_cb, arg.dim_in, parity, arg.offset.data()); idx_out = linkIndex(coordinate, arg.dim_out); } else { // we are dispersing so x_cb is the index for the output. idx_out = x_cb; - getCoordsExtended(coordinate, x_cb, arg.dim_out, parity, arg.offset.array); + getCoordsExtended(coordinate, x_cb, arg.dim_out, parity, arg.offset.data()); idx_in = linkIndex(coordinate, arg.dim_in); } copy_field(s * arg.volume_4d_cb_out + idx_out, s * arg.volume_4d_cb_in + idx_in, parity, arg); diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index a68187b1d4..a756fe43c3 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ b/include/kernels/dslash_domain_wall_m5.cuh @@ -578,7 +578,8 @@ namespace quda for (int s_count = 0; s_count < arg.Ls; s_count++) { auto factorR = (s_ < s ? -arg.m_f * R : R); - Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); r += factorR * in.project(4, proj_dir); R *= coeff.kappa(s); @@ -597,7 +598,8 @@ namespace quda for (int s_count = 0; s_count < arg.Ls; s_count++) { auto factorL = (s_ > s ? -arg.m_f * L : L); - Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? 
cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); l += factorL * in.project(4, proj_dir); L *= coeff.kappa(s); diff --git a/include/kernels/evec_project.cuh b/include/kernels/evec_project.cuh index 7bea55d908..add4b7caba 100644 --- a/include/kernels/evec_project.cuh +++ b/include/kernels/evec_project.cuh @@ -47,25 +47,11 @@ namespace quda { __device__ __host__ spinor_array init() const { return spinor_array(); } }; - template __device__ int idx_from_t_xyz(int t, int xyz, T X[4]) - { - int x[4]; -#pragma unroll - for (int d = 0; d < 4; d++) { - if (d != reduction_dim) { - x[d] = xyz % X[d]; - xyz = xyz / X[d]; - } - } - x[reduction_dim] = t; - return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]); - } - template struct EvecProjection : plus { using reduce_t = spinor_array; - using plus::operator(); - static constexpr int reduce_block_dim = 1; // only doing a reduct in the x thread dimension - + using plus::operator(); + static constexpr int reduce_block_dim = 1; // only doing a reduce in the x thread dimension + const Arg &arg; constexpr EvecProjection(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index 152dbbce2f..03659a722f 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -44,8 +44,8 @@ namespace quda LaplaceArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, int dir, double a, double b, - cvector_ref &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, halo, U, x, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override), + cvector_ref &x, int parity, const int *comm_override) : + DslashArg(out, in, halo, U, x, parity, false, a != 0.0 ? true : false, 1, false, comm_override), halo_pack(halo), halo(halo), U(U), @@ -74,7 +74,7 @@ namespace quda @param[in] thread_dim Which dimension this thread corresponds to (fused exterior only) */ - template + template __device__ __host__ inline void applyLaplace(Vector &out, Arg &arg, Coord &coord, int parity, int, int thread_dim, bool &active, int src_idx) { @@ -136,8 +136,7 @@ namespace quda } // out(x) = M*in - template struct laplace : - dslash_default { + template struct laplace : dslash_default { const Arg &arg; template constexpr laplace(const Ftor &ftor) : arg(ftor.arg) {} @@ -164,12 +163,10 @@ namespace quda // case 4 is an operator in all x,y,z,t dimensions // case 3 is a spatial operator only, the t dimension is omitted. switch (arg.dir) { - case 3: - applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); - break; + case 3: applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); break; case 4: default: - applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); + applyLaplace(out, arg, coord, parity, idx, thread_dim, active, src_idx); break; } diff --git a/include/lattice_field.h b/include/lattice_field.h index 462add3bde..2cf1f6beaa 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -902,6 +902,40 @@ namespace quda { #define checkPrecision(...) Precision_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** + @brief Helper function for determining if the color of the fields is the same. 
+ @param[in] a Input field + @param[in] b Input field + @return If color is unique return the number of colors + */ + template + inline int Color_(const char *func, const char *file, int line, const T1 &a_, const T2 &b_) + { + const unwrap_t &a(a_); + const unwrap_t &b(b_); + int nColor = 0; + if (a.Ncolor() == b.Ncolor()) + nColor = a.Ncolor(); + else + errorQuda("Color %d %d do not match (%s:%d in %s())", a.Ncolor(), b.Ncolor(), file, line, func); + return nColor; + } + + /** + @brief Helper function for determining if the color of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @param[in] args List of additional fields to check color on + @return If colors is unique return the number of colors + */ + template + inline int Color_(const char *func, const char *file, int line, const T1 &a, const T2 &b, const Args &...args) + { + return Color_(func, file, line, a, b) & Color_(func, file, line, a, args...); + } + +#define checkColor(...) Color_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** @brief Helper function for determining if the field is in native order @param[in] a Input field diff --git a/include/quda.h b/include/quda.h index 155791b60a..da78a50a3b 100644 --- a/include/quda.h +++ b/include/quda.h @@ -9,6 +9,7 @@ * as the Fortran interface in lib/quda_fortran.F90. */ +#include // bool support #include #include /* for FILE */ #include @@ -283,8 +284,6 @@ extern "C" { double temp; /**< The mean temperature of the device for the duration of the solve */ double clock; /**< The mean clock frequency of the device for the duration of the solve */ - QudaTune tune; /**< Enable auto-tuning? (default = QUDA_TUNE_YES) */ - /** Number of steps in s-step algorithms */ int Nsteps; @@ -496,7 +495,7 @@ extern "C" { QudaBoolean preserve_deflation; /** This is where we store the deflation space. This will point - to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ + to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ void *preserve_deflation_space; /** If we restore the deflation space, this boolean indicates @@ -506,6 +505,10 @@ extern "C" { false, but preserve_deflation would be true */ QudaBoolean preserve_evals; + /** Whether to use the smeared gauge field for the Dirac operator + for whose eigenvalues are are computing. */ + bool use_smeared_gauge; + /** What type of Dirac operator we are using **/ /** If !(use_norm_op) && !(use_dagger) use M. **/ /** If use_dagger, use Mdag **/ @@ -569,6 +572,12 @@ extern "C" { /** Name of the QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ char QUDA_logfile[512]; + /** The orthogonal direction in the 3D eigensolver **/ + int ortho_dim; + + /** The size of the orthogonal direction in the 3D eigensolver, local **/ + int ortho_dim_size_local; + //------------------------------------------------- // EIG-CG PARAMS @@ -1485,7 +1494,7 @@ extern "C" { * @param inGauge Pointer to the device gauge field (QUDA device field) * @param param The parameters of the host and device fields */ - void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); + void saveGaugeFieldQuda(void *outGauge, void *inGauge, QudaGaugeParam *param); /** * Reinterpret gauge as a pointer to a GaugeField and call destructor. 
diff --git a/include/targets/cuda/reduce_helper.h b/include/targets/cuda/reduce_helper.h index 0614b3bda3..497e2b2b59 100644 --- a/include/targets/cuda/reduce_helper.h +++ b/include/targets/cuda/reduce_helper.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include using count_t = cuda::atomic; namespace quda diff --git a/include/tune_quda.h b/include/tune_quda.h index 03b9b148ee..b49ccc9753 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -43,6 +43,14 @@ namespace quda { */ const std::map &getTuneCache(); + /** + @brief Unify all instances of the tunecache across ranks. This + is called after returning to a global communicator. + @param[in] rank_list The list of ranks from whose tunecaches we + want to merge to form the global tunecache + */ + void joinTuneCache(const std::vector &rank_list); + /** @brief Return a string encoding the QUDA version */ @@ -61,7 +69,7 @@ namespace quda { class Tunable { - friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity); + friend TuneParam tuneLaunch(Tunable &, bool, QudaVerbosity); static inline uint64_t _flops_global = 0; static inline uint64_t _bytes_global = 0; @@ -417,7 +425,7 @@ namespace quda { * @param[in] verbosity What verbosity to use during tuning? * @return The tuned launch parameters */ - TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled = getTuning(), QudaVerbosity verbosity = getVerbosity()); + TuneParam tuneLaunch(Tunable &tunable, bool enabled = getTuning(), QudaVerbosity verbosity = getVerbosity()); /** * @brief Post an event in the trace, recording where it was posted diff --git a/include/util_quda.h b/include/util_quda.h index 3d68fb5a2e..031f878762 100644 --- a/include/util_quda.h +++ b/include/util_quda.h @@ -20,7 +20,25 @@ namespace quda @brief Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_ENABLE_TUNING=0. @return If autotuning is enabled */ -QudaTune getTuning(); +bool getTuning(); + +/** + @brief Set the tuning state + @param[in] tuning New tuning state + */ +void setTuning(bool tuning); + +/** + @brief Push a new tuning state, and back up the present one on the + stack. + @param[in] tune New tuning state + */ +void pushTuning(bool tune); + +/** + @brief Pop the present tuning state and restore what is on the stack. 
+ */ +void popTuning(); QudaVerbosity getVerbosity(); char *getOutputPrefix(); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 950ac83a09..f9c2247b33 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -20,8 +20,9 @@ set (QUDA_OBJS solve.cpp monitor.cpp dirac_coarse.cpp dslash_coarse.cpp coarse_op.cpp coarsecoarse_op.cpp coarse_op_preconditioned.cpp staggered_coarse_op.cpp - eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp vector_io.cpp - eigensolve_quda.cpp quda_arpack_interface.cpp + eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp + eig_trlm_3d.cpp blas_3d.cu + vector_io.cpp eigensolve_quda.cpp quda_arpack_interface.cpp multigrid.cpp transfer.cpp block_orthogonalize.cpp prolongator.cpp restrictor.cpp staggered_prolong_restrict.cu gauge_phase.cu timer.cpp diff --git a/lib/blas_3d.cu b/lib/blas_3d.cu new file mode 100644 index 0000000000..acab3da85e --- /dev/null +++ b/lib/blas_3d.cu @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include + +namespace quda +{ + + namespace blas3d + { + + template class copy3D : TunableKernel2D + { + ColorSpinorField &y; + ColorSpinorField &x; + const int t_slice; + const copyType type; + unsigned int minThreads() const { return y.VolumeCB(); } + + public: + copy3D(ColorSpinorField &y, ColorSpinorField &x, int t_slice, copyType type) : + TunableKernel2D(y, y.SiteSubset()), y(y), x(x), t_slice(t_slice), type(type) + { + // Check slice value + if (t_slice < 0 || t_slice >= y.X()[3]) errorQuda("Unexpected slice %d", t_slice); + + strcat(aux, type == copyType::SWAP_3D ? ",swap_3d" : type == copyType::COPY_TO_3D ? ",to_3d" : ",from_3d"); + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + copy3dArg arg(y, x, t_slice); + switch (type) { + case copyType::COPY_TO_3D: launch(tp, stream, arg); break; + case copyType::COPY_FROM_3D: launch(tp, stream, arg); break; + case copyType::SWAP_3D: launch(tp, stream, arg); break; + default: errorQuda("Unknown 3D copy type"); + } + } + + void preTune() + { + x.backup(); + y.backup(); + } + void postTune() + { + x.restore(); + y.restore(); + } + long long bytes() const + { + return (type == copyType::SWAP_3D ? 
2 : 1) * (x.Bytes() / x.X(3) + y.Bytes() / y.X(3)); + } + }; + + void copy(const int slice, const copyType type, ColorSpinorField &x, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + // Check orth dim + if (x.X()[3] != 1) errorQuda("Unexpected dimensions in x[3]=%d", x.X()[3]); + // We must give a 4D Lattice field as the first argument + instantiate(y, x, slice, type); + } + + void swap(int slice, ColorSpinorField &x, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + instantiate(x, y, slice, copyType::SWAP_3D); + } + + template class axpby3D : TunableKernel2D + { + const ColorSpinorField &x; + ColorSpinorField &y; + const std::vector &a; + const std::vector &b; + unsigned int minThreads() const override { return x.VolumeCB(); } + + public: + axpby3D(const ColorSpinorField &x, ColorSpinorField &y, const std::vector &a, const std::vector &b) : + TunableKernel2D(x, x.SiteSubset()), x(x), y(y), a(a), b(b) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, axpby3dArg(a, x, b, y)); + } + + void preTune() override { y.backup(); } + void postTune() override { y.restore(); } + long long flops() const override { return 6 * x.Volume() * x.Nspin() * x.Ncolor(); } + long long bytes() const override { return x.Bytes() + 2 * y.Bytes(); } + }; + + void axpby(const std::vector &a, const ColorSpinorField &x, const std::vector &b, ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (a.size() != b.size() && a.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array sizes a=%lu b=%lu, x[3]=%d", a.size(), b.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, a, b); + } + + void ax(const std::vector &a, ColorSpinorField &x) + { + std::vector zeros(a.size(), 0.0); + axpby(a, x, zeros, x); + } + + template class caxpby3D : TunableKernel2D + { + const ColorSpinorField &x; + ColorSpinorField &y; + const std::vector &a; + const std::vector &b; + unsigned int minThreads() const override { return x.VolumeCB(); } + + public: + caxpby3D(const ColorSpinorField &x, ColorSpinorField &y, const std::vector &a, + const std::vector &b) : + TunableKernel2D(x, x.SiteSubset()), x(x), y(y), a(a), b(b) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) override + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + launch(tp, stream, caxpby3dArg(a, x, b, y)); + } + + void preTune() override { y.backup(); } + void postTune() override { y.restore(); } + long long flops() const override { return 14 * x.Volume() * x.Nspin() * x.Ncolor(); } + long long bytes() const override { return x.Bytes() + 2 * y.Bytes(); } + }; + + void caxpby(const std::vector &a, const ColorSpinorField &x, const std::vector &b, + ColorSpinorField &y) + { + checkPrecision(x, y); + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (a.size() != b.size() && a.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array sizes a=%lu b=%lu, x[3]=%d", a.size(), b.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, a, b); + } + + template class reDotProduct3D : TunableMultiReduction + { + const ColorSpinorField &x; + const ColorSpinorField &y; + std::vector &result; + + public: + reDotProduct3D(const 
ColorSpinorField &x, const ColorSpinorField &y, std::vector &result) : + TunableMultiReduction(x, x.SiteSubset(), x.X()[3]), x(x), y(y), result(result) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + reDotProduct3dArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const { return x.Volume() * x.Nspin() * x.Ncolor() * 2; } + long long bytes() const { return x.Bytes() + y.Bytes(); } + }; + + void reDotProduct(std::vector &result, const ColorSpinorField &x, const ColorSpinorField &y) + { + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (result.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array size a=%lu, x[3]=%d", result.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, result); + } + + template class cDotProduct3D : TunableMultiReduction + { + const ColorSpinorField &x; + const ColorSpinorField &y; + std::vector &result; + + public: + cDotProduct3D(const ColorSpinorField &x, const ColorSpinorField &y, std::vector &result) : + TunableMultiReduction(x, x.SiteSubset(), x.X()[3]), x(x), y(y), result(result) + { + apply(device::get_default_stream()); + } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + cDotProduct3dArg arg(x, y); + launch(result, tp, stream, arg); + } + + long long flops() const { return x.Volume() * x.Nspin() * x.Ncolor() * 8; } + long long bytes() const { return x.Bytes() + y.Bytes(); } + }; + + void cDotProduct(std::vector &result, const ColorSpinorField &x, const ColorSpinorField &y) + { + checkSpin(x, y); + checkColor(x, y); + + // Check coefficients + if (result.size() != (unsigned int)x.X()[3]) + errorQuda("Unexpected coeff array size a=%lu, x[3]=%d", result.size(), x.X()[3]); + + // We must give a Lattice field as the first argument + instantiate(x, y, result); + } + + } // namespace blas3d + +} // namespace quda diff --git a/lib/check_params.h b/lib/check_params.h index 639db3eb3d..cdbe36169b 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -175,6 +175,7 @@ void printQudaEigParam(QudaEigParam *param) { P(preserve_deflation, QUDA_BOOLEAN_FALSE); P(preserve_deflation_space, 0); P(preserve_evals, QUDA_BOOLEAN_TRUE); + P(use_smeared_gauge, false); P(use_dagger, QUDA_BOOLEAN_FALSE); P(use_norm_op, QUDA_BOOLEAN_FALSE); P(compute_svd, QUDA_BOOLEAN_FALSE); diff --git a/lib/communicator_mpi.cpp b/lib/communicator_mpi.cpp index cc244b43b8..e0e652c26c 100644 --- a/lib/communicator_mpi.cpp +++ b/lib/communicator_mpi.cpp @@ -106,6 +106,7 @@ namespace quda } comm_init(nDim, commDims, rank_from_coords, map_data); + globalReduce.push(true); } @@ -174,6 +175,9 @@ namespace quda grid_size, size); } + // defer handling MPI errors to QUDA + MPI_Comm_set_errhandler(MPI_COMM_HANDLE, MPI_ERRORS_RETURN); + comm_init_common(ndim, dims, rank_from_coords, map_data); } diff --git a/lib/communicator_qmp.cpp b/lib/communicator_qmp.cpp index 1a4a2a7878..67d4a8694f 100644 --- a/lib/communicator_qmp.cpp +++ b/lib/communicator_qmp.cpp @@ -166,6 +166,9 @@ void Communicator::comm_init(int ndim, const int *dims, QudaCommsMap rank_from_c grid_size, QMP_comm_get_number_of_nodes(QMP_COMM_HANDLE)); } + // defer handling MPI errors to QMP + MPI_Comm_set_errhandler(MPI_COMM_HANDLE, MPI_ERRORS_RETURN); + comm_init_common(ndim, dims, rank_from_coords, map_data); } diff --git a/lib/communicator_stack.cpp 
b/lib/communicator_stack.cpp index 4abf6e38a2..cc36909d51 100644 --- a/lib/communicator_stack.cpp +++ b/lib/communicator_stack.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace quda { @@ -49,8 +50,16 @@ namespace quda void push_communicator(const CommKey &split_key) { if (comm_nvshmem_enabled()) - errorQuda( - "Split-grid is currently not supported with NVSHMEM. Please set QUDA_ENABLE_NVSHMEM=0 to disable NVSHMEM."); + errorQuda("Split-grid is currently not supported with NVSHMEM. Set QUDA_ENABLE_NVSHMEM=0 to disable NVSHMEM."); + + // if reverting to global, we will need to join the tunecaches + bool join_tune_cache = split_key == default_comm_key; + int local_rank = comm_rank(); + int local_tune_rank = 0; + + // used to store the size of the tunecache at the point of splitting + static size_t tune_cache_size = 0; + auto search = communicator_stack.find(split_key); if (search == communicator_stack.end()) { communicator_stack.emplace(std::piecewise_construct, std::forward_as_tuple(split_key), @@ -59,7 +68,38 @@ namespace quda LatticeField::freeGhostBuffer(); // Destroy the (IPC) Comm buffers with the old communicator. + auto split_key_old = current_key; current_key = split_key; + + // we are returning to global so we need to join any diverged tunecaches + if (join_tune_cache) { + // has this tunecache been updated? + int tune_cache_update = tune_cache_size != getTuneCache().size(); + // have any of tunecaches across the split grid been updated? + comm_allreduce_int(tune_cache_update); + + if (tune_cache_update) { + auto num_sub_partition = split_key_old.product(); // the number of caches we need to merge + int sub_partition_dims[] = {comm_dim(0) / split_key_old[0], comm_dim(1) / split_key_old[1], + comm_dim(2) / split_key_old[2], comm_dim(3) / split_key_old[3]}; + int sub_partition_coords[] = {comm_coord(0) / sub_partition_dims[0], comm_coord(1) / sub_partition_dims[1], + comm_coord(2) / sub_partition_dims[2], comm_coord(3) / sub_partition_dims[3]}; + auto sub_partition_idx = sub_partition_coords[split_key_old.n_dim - 1]; + for (auto d = split_key_old.n_dim - 2; d >= 0; d--) + sub_partition_idx = sub_partition_idx * split_key_old[d] + sub_partition_coords[d]; + + std::vector global_tune_ranks(num_sub_partition); + for (auto i = 0; i < num_sub_partition; i++) { + global_tune_ranks[i] = (sub_partition_idx == i && local_tune_rank == local_rank) ? 
comm_rank() : 0; + comm_allreduce_int(global_tune_ranks[i]); + } + // we now have a list of all the global tune ranks so we can join them + joinTuneCache(global_tune_ranks); + } + } else if (!join_tune_cache) { + // record size of tunecache when first splitting the grid + tune_cache_size = getTuneCache().size(); + } } #if defined(QMP_COMMS) || defined(MPI_COMMS) @@ -70,8 +110,12 @@ namespace quda int comm_dim(int dim) { return get_current_communicator().comm_dim(dim); } + int comm_dim_global(int dim) { return get_default_communicator().comm_dim(dim); } + int comm_coord(int dim) { return get_current_communicator().comm_coord(dim); } + int comm_coord_global(int dim) { return get_default_communicator().comm_coord(dim); } + int comm_rank_from_coords(const int *coords) { return get_current_communicator().comm_rank_from_coords(coords); } void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data, bool user_set_comm_handle, diff --git a/lib/eig_trlm_3d.cpp b/lib/eig_trlm_3d.cpp new file mode 100644 index 0000000000..7480a58395 --- /dev/null +++ b/lib/eig_trlm_3d.cpp @@ -0,0 +1,670 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace quda +{ + // Thick Restarted Lanczos Method constructor + TRLM3D::TRLM3D(const DiracMatrix &mat, QudaEigParam *eig_param) : EigenSolver(mat, eig_param) + { + getProfile().TPSTART(QUDA_PROFILE_INIT); + + ortho_dim = eig_param->ortho_dim; + ortho_dim_size = eig_param->ortho_dim_size_local; + if (ortho_dim != 3) + errorQuda("Only 3D spatial splitting (ortho_dim = 3) is supported, ortho_dim passed = %d", ortho_dim); + verbose_rank = comm_coord(0) == 0 && comm_coord(1) == 0 && comm_coord(2) == 0; + + // Tridiagonal/Arrow matrices + alpha_3D.resize(ortho_dim_size); + beta_3D.resize(ortho_dim_size); + + residua_3D.resize(ortho_dim_size); + ritz_mat_3D.resize(ortho_dim_size); + converged_3D.resize(ortho_dim_size, false); + active_3D.resize(ortho_dim_size, false); + + iter_locked_3D.resize(ortho_dim_size, 0); + iter_keep_3D.resize(ortho_dim_size, 0); + iter_converged_3D.resize(ortho_dim_size, 0); + + num_locked_3D.resize(ortho_dim_size, 0); + num_keep_3D.resize(ortho_dim_size, 0); + num_converged_3D.resize(ortho_dim_size, 0); + + for (int i = 0; i < ortho_dim_size; i++) { + alpha_3D[i].resize(n_kr, 0.0); + beta_3D[i].resize(n_kr, 0.0); + residua_3D[i].resize(n_kr, 0.0); + } + + // 3D thick restart specific checks + if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than n_ev+6=%d\n", n_kr, n_ev + 6); + + if (!(eig_param->spectrum == QUDA_SPECTRUM_LR_EIG || eig_param->spectrum == QUDA_SPECTRUM_SR_EIG)) { + errorQuda("Only real spectrum type (LR or SR) can be passed to the TR Lanczos solver"); + } + + getProfile().TPSTOP(QUDA_PROFILE_INIT); + } + + void TRLM3D::operator()(std::vector &kSpace, std::vector &evals) + { + // Create 3-d split communicators + CommKey split_key = {1, 1, 1, comm_dim(3)}; + + if (!split_key.is_valid()) { + errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]); + } + logQuda(QUDA_DEBUG_VERBOSE, "Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n", + comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]); + push_communicator(split_key); + + // Override any user input for block size. 
+ block_size = 1; + + // Pre-launch checks and preparation + //--------------------------------------------------------------------------- + queryPrec(kSpace[0].Precision()); + // Check to see if we are loading eigenvectors + if (strcmp(eig_param->vec_infile, "") != 0) { + printfQuda("Loading evecs from file name %s\n", eig_param->vec_infile); + loadFromFile(kSpace, evals); + return; + } + + // Check for an initial guess. If none present, populate with rands, then + // orthonormalise + prepareInitialGuess3D(kSpace, ortho_dim_size); + + // Increase the size of kSpace passed to the function, will be trimmed to + // original size before exit. + prepareKrylovSpace(kSpace, evals); + + // Check for Chebyshev maximum estimation + checkChebyOpMax(kSpace); + + // Convergence and locking criteria + std::vector mat_norm_3D(ortho_dim_size, 0.0); + double epsilon = setEpsilon(kSpace[0].Precision()); + + // Print Eigensolver params + printEigensolverSetup(); + //--------------------------------------------------------------------------- + + // Begin TRLM Eigensolver computation + //--------------------------------------------------------------------------- + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + int t_offset = ortho_dim_size * comm_coord(3); + + // Loop over restart iterations. + while (restart_iter < max_restarts && !converged) { + + // Get min step + int step_min = *std::min_element(num_locked_3D.begin(), num_locked_3D.end()); + comm_allreduce_min(step_min); + + for (int step = step_min; step < n_kr; step++) lanczosStep3D(kSpace, step); + iter += (n_kr - step_min); + + // The eigenvalues are returned in the alpha array + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + eigensolveFromArrowMat3D(); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + // mat_norm is updated and used for LR + for (int t = 0; t < ortho_dim_size; t++) + for (int i = num_locked_3D[t]; i < n_kr; i++) + if (fabs(alpha_3D[t][i]) > mat_norm_3D[t]) mat_norm_3D[t] = fabs(alpha_3D[t][i]); + + // Lambda that returns mat_norm for LR and returns the relevant alpha + // (the corresponding Ritz value) for SR + auto check_norm = [&](double sr_norm, int t) -> double { + if (eig_param->spectrum == QUDA_SPECTRUM_LR_EIG) + return mat_norm_3D[t]; + else + return sr_norm; + }; + + // Locking check + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + iter_locked_3D[t] = 0; + for (int i = 0; i < (n_kr - num_locked_3D[t]); i++) { + if (residua_3D[t][i + num_locked_3D[t]] < epsilon * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)) { + logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d %d resid=%+.6e condition=%.6e ****\n", t, i, + residua_3D[t][i + num_locked_3D[t]], epsilon * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)); + iter_locked_3D[t] = i + 1; + } else { + // Unlikely to find new locked pairs + break; + } + } + } + } + + // Convergence check + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + iter_converged_3D[t] = iter_locked_3D[t]; + for (int i = iter_locked_3D[t]; i < n_kr - num_locked_3D[t]; i++) { + if (residua_3D[t][i + num_locked_3D[t]] < tol * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)) { + logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d %d resid=%+.6e condition=%.6e ****\n", t, i, + residua_3D[t][i + num_locked_3D[t]], tol * check_norm(alpha_3D[t][i + num_locked_3D[t]], t)); + iter_converged_3D[t] = i + 1; + } else { + // Unlikely to find new converged pairs + break; + } + } + } + } + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) + iter_keep_3D[t] + = 
std::min(iter_converged_3D[t] + (n_kr - num_converged_3D[t]) / 2, n_kr - num_locked_3D[t] - 12); + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + computeKeptRitz3D(kSpace); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + + int min_nconv = n_kr + 1; + int min_nlock = n_kr + 1; + int min_nkeep = n_kr + 1; + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + num_converged_3D[t] = num_locked_3D[t] + iter_converged_3D[t]; + num_keep_3D[t] = num_locked_3D[t] + iter_keep_3D[t]; + num_locked_3D[t] += iter_locked_3D[t]; + + if (num_converged_3D[t] < min_nconv) min_nconv = num_converged_3D[t]; + if (num_locked_3D[t] < min_nlock) min_nlock = num_locked_3D[t]; + if (num_keep_3D[t] < min_nkeep) min_nkeep = num_keep_3D[t]; + + // Use printf to get data from t dim only + if (getVerbosity() >= QUDA_DEBUG_VERBOSE && verbose_rank) { + printf("%04d converged eigenvalues for timeslice %d at restart iter %04d\n", num_converged_3D[t], + t_offset + t, restart_iter + 1); + printf("iter Conv[%d] = %d\n", t_offset + t, iter_converged_3D[t]); + printf("iter Keep[%d] = %d\n", t_offset + t, iter_keep_3D[t]); + printf("iter Lock[%d] = %d\n", t_offset + t, iter_locked_3D[t]); + printf("num_converged[%d] = %d\n", t_offset + t, num_converged_3D[t]); + printf("num_keep[%d] = %d\n", t_offset + t, num_keep_3D[t]); + printf("num_locked[%d] = %d\n", t_offset + t, num_locked_3D[t]); + for (int i = 0; i < n_kr; i++) { + printf("Ritz[%d][%d] = %.16e residual[%d] = %.16e\n", t_offset + t, i, alpha_3D[t][i], i, residua_3D[t][i]); + } + } + } + } + + // Check for convergence + bool all_converged = true; + for (int t = 0; t < ortho_dim_size; t++) { + if (num_converged_3D[t] >= n_conv) { + converged_3D[t] = true; + } else { + all_converged = false; + } + } + + if (getVerbosity() >= QUDA_VERBOSE && verbose_rank) { + std::stringstream converged; + std::copy(converged_3D.begin(), converged_3D.end(), std::ostream_iterator(converged, "")); + printf("iter = %d rank = %d converged = %s min nlock %3d nconv %3d nkeep %3d\n", restart_iter + 1, + comm_rank_global(), converged.str().c_str(), min_nlock, min_nconv, min_nkeep); + } + + if (all_converged) { + reorder3D(kSpace); + converged = true; + } + + restart_iter++; + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + + // Post computation report + //--------------------------------------------------------------------------- + if (!converged) { + if (eig_param->require_convergence) { + errorQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. Exiting.", + n_conv, n_ev, n_kr, max_restarts); + } else { + warningQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. 
+ + // Post computation report + //--------------------------------------------------------------------------- + if (!converged) { + if (eig_param->require_convergence) { + errorQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. Exiting.", + n_conv, n_ev, n_kr, max_restarts); + } else { + warningQuda("TRLM failed to compute the requested %d vectors with a %d search space and %d Krylov space in %d " + "restart steps. Continuing with current Lanczos factorisation.", + n_conv, n_ev, n_kr, max_restarts); + // Compute eigenvalues + computeEvals(kSpace, evals); + } + } else { + if (verbose_rank) { + if (getVerbosity() >= QUDA_SUMMARIZE) { + printf("TRLM (rank = %d) computed the requested %d vectors in %d restart steps and %d OP*x operations\n", + comm_rank_global(), n_conv, restart_iter, iter); + } + + if (getVerbosity() >= QUDA_VERBOSE) { + // Dump all Ritz values and residua if using Chebyshev + if (eig_param->use_poly_acc) { + for (int t = 0; t < ortho_dim_size; t++) { + for (int i = 0; i < n_conv; i++) { + printf("RitzValue[%d][%04d]: (%+.16e, %+.16e) residual %.16e\n", t, i, alpha_3D[t][i], 0.0, + residua_3D[t][i]); + } + } + } + } + } + + // Compute eigenvalues + computeEvals(kSpace, evals); + } + + push_communicator(default_comm_key); + + // ensure all processes have all e-values + comm_allreduce_sum(evals); + + // Local clean-up + cleanUpEigensolver(kSpace, evals); + }
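// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] For orientation, this is roughly
// how a user could request the new solver through the C interface. All
// numerical values are placeholders, and the surrounding setup (initQuda,
// loadGaugeQuda, and gauge smearing when use_smeared_gauge is set) is
// assumed to have happened already:
//
//   QudaInvertParam inv_param = newQudaInvertParam();
//   inv_param.dslash_type = QUDA_LAPLACE_DSLASH;   // 3-d Laplace operator
//
//   QudaEigParam eig_param = newQudaEigParam();
//   eig_param.invert_param = &inv_param;
//   eig_param.eig_type = QUDA_EIG_TR_LANCZOS_3D;   // one TRLM per timeslice
//   eig_param.spectrum = QUDA_SPECTRUM_SR_EIG;
//   eig_param.n_ev = 16; eig_param.n_kr = 48; eig_param.n_conv = 16;
//   eig_param.tol = 1e-12;
//   eig_param.use_smeared_gauge = QUDA_BOOLEAN_TRUE; // operate on gaugeSmeared
//
//   // host_evals must hold n_conv values for every *global* timeslice
//   eigensolveQuda(host_evecs, host_evals, &eig_param);
// ---------------------------------------------------------------------------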
+ + // Thick Restart 3D Member functions + //--------------------------------------------------------------------------- + void TRLM3D::lanczosStep3D(std::vector<ColorSpinorField> &v, int j) + { + // Compute r[t] = A[t] * v_j[t] - b_{j-1}[t] * v_{j-1}[t] + // r[t] = A[t] * v_j[t] + + // Use unit coefficient vectors while we only have axpby in place of axpy + std::vector<double> unit(ortho_dim_size, 1.0); + std::vector<double> alpha_j(ortho_dim_size, 0.0); + std::vector<double> beta_j(ortho_dim_size, 0.0); + + // 3D vectors that hold data for individual t components + ColorSpinorParam csParamClone(v[0]); + csParamClone.change_dim(ortho_dim, 1); + std::vector<ColorSpinorField> vecs_t(1, csParamClone); + ColorSpinorField r_t(csParamClone); + + // Identify active 3D slices. The active_3D array + // should be modified only here throughout the entire + // algorithm. + for (int t = 0; t < ortho_dim_size; t++) { + // Every element of the active array must be assessed + active_3D[t] = (num_keep_3D[t] <= j && !converged_3D[t]) ? true : false; + } + + // This will be a blocked operator with no + // connections in the ortho_dim (usually t), + // hence the 3D sections of each vector + // will be independent. + chebyOp(r[0], v[j]); + + // a_j[t] = v_j^dag[t] * r[t] + blas3d::reDotProduct(alpha_j, v[j], r[0]); + for (int t = 0; t < ortho_dim_size; t++) { + // Only active problem data is recorded + if (active_3D[t]) alpha_3D[t][j] = alpha_j[t]; + } + + // r[t] = r[t] - a_j[t] * v_j[t] + for (int t = 0; t < ortho_dim_size; t++) alpha_j[t] = active_3D[t] ? -alpha_j[t] : 0.0; + blas3d::axpby(alpha_j, v[j], unit, r[0]); + + // r[t] = r[t] - b_{j-1}[t] * v_{j-1}[t] + // Only orthogonalise active problems + for (int t = 0; t < ortho_dim_size; t++) { + if (active_3D[t]) { + int start = (j > num_keep_3D[t]) ? j - 1 : 0; + if (j - start > 0) { + + // Ensure we have enough 3D vectors + if ((int)vecs_t.size() < j - start) resize(vecs_t, j - start, csParamClone); + + // Copy the 3D data into the 3D vectors, create beta array, and create + // pointers to the 3D vectors + std::vector<double> beta_t; + beta_t.reserve(j - start); + // Copy residual + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, r_t, r[0]); + // Copy vectors + for (int i = start; i < j; i++) { + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[i - start], v[i]); + beta_t.push_back(-beta_3D[t][i]); + } + + // r[t] = r[t] - beta[t]{j-1} * v[t]{j-1} + blas::block::axpy(beta_t, {vecs_t.begin(), vecs_t.begin() + j - start}, r_t); + + // Copy residual back to 4D vector + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, r_t, r[0]); + } + } + } + + // Orthogonalise r against the Krylov space + for (int k = 0; k < 1; k++) blockOrthogonalize3D(v, r, j + 1); // future work: up to 4 times??? + + // b_j[t] = ||r[t]|| + blas3d::reDotProduct(beta_j, r[0], r[0]); + for (int t = 0; t < ortho_dim_size; t++) beta_j[t] = active_3D[t] ? sqrt(beta_j[t]) : 0.0; + + // Prepare next step. + // v_{j+1}[t] = r[t] / b_{j}[t] + for (int t = 0; t < ortho_dim_size; t++) { + if (active_3D[t]) { + beta_3D[t][j] = beta_j[t]; + beta_j[t] = 1.0 / beta_j[t]; + } + } + + std::vector<double> c(ortho_dim_size); + for (int t = 0; t < ortho_dim_size; t++) c[t] = beta_j[t] == 0.0 ? 1.0 : 0.0; + blas3d::axpby(beta_j, r[0], c, v[j + 1]); + }
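// [Editor's note] blas3d currently exposes only (c)axpby (see
// include/blas_3d.h), so the steps above emulate the timeslice axpy
// y[t] <- a[t]*x[t] + y[t] by passing a unit b vector. Sketch, with
// placeholder coefficients:
//
//   std::vector<double> a(ortho_dim_size, -0.5); // per-slice coefficients
//   std::vector<double> one(ortho_dim_size, 1.0);
//   blas3d::axpby(a, x, one, y); // y[t] = a[t]*x[t] + 1.0*y[t] on each slice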
+ + void TRLM3D::reorder3D(std::vector<ColorSpinorField> &kSpace) + { + for (int t = 0; t < ortho_dim_size; t++) { + int i = 0; + if (reverse) { + while (i < n_kr) { + if ((i == 0) || (alpha_3D[t][i - 1] >= alpha_3D[t][i])) + i++; + else { + std::swap(alpha_3D[t][i], alpha_3D[t][i - 1]); + blas3d::swap(t, kSpace[i], kSpace[i - 1]); + i--; + } + } + } else { + while (i < n_kr) { + if ((i == 0) || (alpha_3D[t][i - 1] <= alpha_3D[t][i])) + i++; + else { + std::swap(alpha_3D[t][i], alpha_3D[t][i - 1]); + blas3d::swap(t, kSpace[i], kSpace[i - 1]); + i--; + } + } + } + } + } + + void TRLM3D::eigensolveFromArrowMat3D() + { + getProfile().TPSTART(QUDA_PROFILE_EIGEN); + + // Loop over the 3D problems +#pragma omp parallel for + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + + int dim = n_kr - num_locked_3D[t]; + int arrow_pos = num_keep_3D[t] - num_locked_3D[t]; + + // Eigen objects + MatrixXd A = MatrixXd::Zero(dim, dim); + ritz_mat_3D[t].resize(dim * dim); + for (int i = 0; i < dim * dim; i++) ritz_mat_3D[t][i] = 0.0; + + // Invert the spectrum due to Chebyshev + if (reverse) { + for (int i = num_locked_3D[t]; i < n_kr - 1; i++) { + alpha_3D[t][i] *= -1.0; + beta_3D[t][i] *= -1.0; + } + alpha_3D[t][n_kr - 1] *= -1.0; + } + + // Construct arrow mat A_{dim,dim} + for (int i = 0; i < dim; i++) { + + // alpha_3D populates the diagonal + A(i, i) = alpha_3D[t][i + num_locked_3D[t]]; + } + + for (int i = 0; i < arrow_pos; i++) { + + // beta_3D populates the arrow + A(i, arrow_pos) = beta_3D[t][i + num_locked_3D[t]]; + A(arrow_pos, i) = beta_3D[t][i + num_locked_3D[t]]; + } + + for (int i = arrow_pos; i < dim - 1; i++) { + + // beta_3D populates the sub-diagonal + A(i, i + 1) = beta_3D[t][i + num_locked_3D[t]]; + A(i + 1, i) = beta_3D[t][i + num_locked_3D[t]]; + } + + // Eigensolve the arrow matrix + SelfAdjointEigenSolver<MatrixXd> eigensolver; + eigensolver.compute(A); + + // repopulate ritz matrix + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) ritz_mat_3D[t][dim * i + j] = eigensolver.eigenvectors().col(i)[j]; + + for (int i = 0; i < dim; i++) { + residua_3D[t][i + num_locked_3D[t]] = fabs(beta_3D[t][n_kr - 1] * eigensolver.eigenvectors().col(i)[dim - 1]); + // Update the alpha_3D array + alpha_3D[t][i + num_locked_3D[t]] = eigensolver.eigenvalues()[i]; + } + + // Put spectrum back in order + if (reverse) { + for (int i = num_locked_3D[t]; i < n_kr; i++) { alpha_3D[t][i] *= -1.0; } + } + } + } + + getProfile().TPSTOP(QUDA_PROFILE_EIGEN); + }
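// [Editor's note] Per timeslice, the matrix A assembled above has the usual
// thick-restart "arrow" shape, e.g. for dim = 5, arrow_pos = 2:
//
//   [ a0 .  b0 .  .  ]
//   [ .  a1 b1 .  .  ]
//   [ b0 b1 a2 b2 .  ]
//   [ .  .  b2 a3 b3 ]
//   [ .  .  .  b3 a4 ]
//
// i.e. kept Ritz values on the diagonal, their residual couplings in the
// arrow row/column, and the fresh Lanczos tridiagonal in the trailing block.
// The Ritz residual taken afterwards is the standard Lanczos bound
// |beta_{m} * s_{m,i}|, the last component of each eigenvector scaled by the
// final beta.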
+ + void TRLM3D::computeKeptRitz3D(std::vector<ColorSpinorField> &kSpace) + { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + // Multi-BLAS friendly array to store the part of the Ritz matrix we want + std::vector<std::vector<double>> ritz_mat_keep(ortho_dim_size); + + std::vector<ColorSpinorField> vecs_t; + std::vector<ColorSpinorField> kSpace_t; + + ColorSpinorParam csParamClone(kSpace[0]); + csParamClone.create = QUDA_ZERO_FIELD_CREATE; + csParamClone.change_dim(ortho_dim, 1); + + for (int t = 0; t < ortho_dim_size; t++) { + if (!converged_3D[t]) { + int dim = n_kr - num_locked_3D[t]; + int keep = iter_keep_3D[t]; + + ritz_mat_keep[t].resize(dim * keep); + for (int j = 0; j < dim; j++) { + for (int i = 0; i < keep; i++) { ritz_mat_keep[t][j * keep + i] = ritz_mat_3D[t][i * dim + j]; } + } + + // Alias the vectors we wish to keep. + vector_ref<ColorSpinorField> vecs_locked(kSpace.begin() + num_locked_3D[t], + kSpace.begin() + num_locked_3D[t] + dim); + + // multiBLAS axpy. Create 3D vectors so that we may perform all t independent + // vector rotations. + vecs_t.reserve(dim); + kSpace_t.reserve(keep); + + // Create 3D arrays + for (int i = vecs_t.size(); i < dim; i++) vecs_t.push_back(csParamClone); + for (int i = kSpace_t.size(); i < keep; i++) kSpace_t.push_back(csParamClone); + + blas::zero(kSpace_t); + + // Copy data to the 3D arrays, zero out workspace, make pointers + for (int i = 0; i < dim; i++) blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[i], vecs_locked[i]); + + // Compute the axpy + blas::block::axpy(ritz_mat_keep[t], {vecs_t.begin(), vecs_t.begin() + dim}, + {kSpace_t.begin(), kSpace_t.begin() + keep}); + + // Copy back to the 4D workspace array + + // Copy compressed Krylov + for (int i = 0; i < keep; i++) { + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, kSpace_t[i], kSpace[num_locked_3D[t] + i]); + } + + // Update residual vector + blas3d::copy(t, blas3d::copyType::COPY_TO_3D, vecs_t[0], kSpace[n_kr]); + blas3d::copy(t, blas3d::copyType::COPY_FROM_3D, vecs_t[0], kSpace[num_locked_3D[t] + keep]); + + // Update sub arrow matrix + for (int i = 0; i < keep; i++) + beta_3D[t][i + num_locked_3D[t]] = beta_3D[t][n_kr - 1] * ritz_mat_3D[t][dim * (i + 1) - 1]; + } + } + + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); + } + + // Orthogonalise r[t][0:] against V_[t][0:j] + void TRLM3D::blockOrthogonalize3D(std::vector<ColorSpinorField> &vecs, std::vector<ColorSpinorField> &rvecs, int j) + { + for (int i = 0; i < j; i++) { + std::vector<Complex> s_t(ortho_dim_size, 0.0); + std::vector<Complex> unit_new(ortho_dim_size, {1.0, 0.0}); + + // Block dot products stored in s_t. + blas3d::cDotProduct(s_t, vecs[i], rvecs[0]); + for (int t = 0; t < ortho_dim_size; t++) { s_t[t] *= active_3D[t] ? -1.0 : 0.0; } + + // Block orthogonalise + blas3d::caxpby(s_t, vecs[i], unit_new, rvecs[0]); + } + } + + void TRLM3D::prepareInitialGuess3D(std::vector<ColorSpinorField> &kSpace, int ortho_dim_size) + { + auto nrm = blas::norm2(kSpace[0]); + if (!std::isfinite(nrm) || nrm == 0.0) { + RNG rng(kSpace[0], 1234); + spinorNoise(kSpace[0], rng, QUDA_NOISE_UNIFORM); + } + + std::vector<double> norms(ortho_dim_size, 0.0); + std::vector<double> zeros(ortho_dim_size, 0.0); + blas3d::reDotProduct(norms, kSpace[0], kSpace[0]); + for (int t = 0; t < ortho_dim_size; t++) norms[t] = 1.0 / sqrt(norms[t]); + blas3d::axpby(norms, kSpace[0], zeros, kSpace[0]); + } + + double TRLM3D::estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) + { + RNG rng(in, 1234); + spinorNoise(in, rng, QUDA_NOISE_UNIFORM); + + // Power iteration + std::vector<double> norms(ortho_dim_size, 0.0); + std::vector<double> zeros(ortho_dim_size, 0.0); + for (int i = 0; i < 100; i++) { + if ((i + 1) % 10 == 0) { + blas3d::reDotProduct(norms, in, in); + for (int t = 0; t < ortho_dim_size; t++) norms[t] = 1.0 / sqrt(norms[t]); + blas3d::axpby(norms, in, zeros, in); + } + mat(out, in); + std::swap(out, in); + } + + // Compute spectral radius estimate + std::vector<double> inner_products(ortho_dim_size, 0.0); + blas3d::reDotProduct(inner_products, out, in); + + auto result = *std::max_element(inner_products.begin(), inner_products.end()); + comm_allreduce_max(result); + if (verbose_rank && getVerbosity() >= QUDA_VERBOSE) + printf("Chebyshev max = %e (rank = %d)\n", result, comm_rank_global()); + + // Increase final result by 10% for safety + return result * 1.10; + }
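// [Editor's note] The estimate above is a plain power iteration run on all
// timeslices at once: every tenth application of mat the field is
// renormalised slice by slice, the resulting per-slice inner products
// <v, A v> are reduced with a global max, and a 10% safety margin is added
// so that the Chebyshev window encloses the largest per-slice eigenvalue.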
+ + void TRLM3D::computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size) + { + if (size == 0) size = n_conv; + if (size > (int)evecs.size()) + errorQuda("Requesting %d eigenvectors with only storage allocated for %lu", size, evecs.size()); + if (size > (int)evals.size()) + errorQuda("Requesting %d eigenvalues with only storage allocated for %lu", size, evals.size()); + + ColorSpinorParam csParamClone(evecs[0]); + ColorSpinorField tmp(csParamClone); + + std::vector<std::vector<Complex>> evals_t(size, std::vector<Complex>(ortho_dim_size, 0.0)); + std::vector<double> norms(ortho_dim_size, 0.0); + std::vector<double> unit(ortho_dim_size, 1.0); + std::vector<Complex> n_unit(ortho_dim_size, {-1.0, 0.0}); + + for (int i = 0; i < size; i++) { + // r = A * v_i + mat(tmp, evecs[i]); + + // lambda_i = v_i^dag A v_i / (v_i^dag * v_i) + // Block dot products stored in evals_t. + blas3d::cDotProduct(evals_t[i], evecs[i], tmp); + blas3d::reDotProduct(norms, evecs[i], evecs[i]); + for (int t = 0; t < ortho_dim_size; t++) evals_t[i][t] /= sqrt(norms[t]); + + // Measure ||lambda_i*v_i - A*v_i|| + blas3d::caxpby(evals_t[i], evecs[i], n_unit, tmp); + blas3d::reDotProduct(norms, tmp, tmp); + for (int t = 0; t < ortho_dim_size; t++) residua_3D[t][i] = sqrt(norms[t]); + } + + // If size = n_conv, this routine is called post sort + if (size == n_conv) { + evals.resize(ortho_dim_size * comm_dim_global(ortho_dim) * n_conv, 0.0); + + if (verbose_rank) { + int t_offset = ortho_dim_size * comm_coord_global(3); + for (int t = 0; t < ortho_dim_size; t++) { + for (int i = 0; i < size; i++) { + + // Use printf (rather than printfQuda) so that any rank owning this timeslice can report + if (getVerbosity() >= QUDA_VERBOSE) { + printf("Eval[%02d][%04d] = (%+.16e,%+.16e) residual = %+.16e\n", t_offset + t, i, evals_t[i][t].real(), + evals_t[i][t].imag(), residua_3D[t][i]); + } + + // Transfer evals to eval array + evals[(t_offset + t) * size + i] = evals_t[i][t]; + } + } + } + } + } + +} // namespace quda diff --git a/lib/eigensolve_quda.cpp index 8130897cbb..52f4fd1100 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -104,6 +104,10 @@ namespace quda logQuda(QUDA_VERBOSE, "Creating TR Lanczos eigensolver\n"); eig_solver = new TRLM(mat, eig_param); break; + case QUDA_EIG_TR_LANCZOS_3D: + logQuda(QUDA_VERBOSE, "Creating TR Lanczos 3-d eigensolver\n"); + eig_solver = new TRLM3D(mat, eig_param); + break; case QUDA_EIG_BLK_TR_LANCZOS: logQuda(QUDA_VERBOSE, "Creating Block TR Lanczos eigensolver\n"); eig_solver = new BLKTRLM(mat, eig_param); @@ -236,7 +240,7 @@ int n_eig = n_conv; if (compute_svd) n_eig *= 2; kSpace.resize(n_eig); - evals.resize(n_conv); + if (eig_param->eig_type != QUDA_EIG_TR_LANCZOS_3D) evals.resize(n_conv); // Only save if outfile is defined if (strcmp(eig_param->vec_outfile, "") != 0) { @@ -580,6 +584,7 @@ void EigenSolver::computeEvals(std::vector<ColorSpinorField> &evecs, std::vector<Complex> &evals, int size) { + if (size == 0) size = n_conv; auto batch_size = eig_param->compute_evals_batch_size; if (size > static_cast<int>(evecs.size())) @@ -683,7 +688,7 @@ case QUDA_SPECTRUM_SR_EIG: printfQuda("'SR' -> sort with real(x) in increasing algebraic order, smallest first.\n"); break; case QUDA_SPECTRUM_LI_EIG: printfQuda("'LI' -> sort with imag(x) in decreasing algebraic order, largest first.\n"); break; case QUDA_SPECTRUM_SI_EIG: printfQuda("'SI' -> sort with imag(x) in increasing algebraic order, smallest first\n"); break; - default: errorQuda("Unkown spectrum type requested: %d", spec_type); + default: errorQuda("Unknown spectrum type requested: %d", spec_type); } }
diff --git a/lib/gauge_laplace.cpp index 00843bb272..752670b676 100644 --- a/lib/gauge_laplace.cpp +++ b/lib/gauge_laplace.cpp @@ -29,7 +29,7 @@ namespace quda { comm_dim[i] = comm_dim_partitioned(i); if (laplace3D == i) comm_dim[i] = 0; } - ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, dagger, comm_dim, profile); + ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, comm_dim, profile); } void GaugeLaplace::DslashXpay(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, @@ -43,7 +43,7 @@ { comm_dim[i] = comm_dim_partitioned(i); if (laplace3D == i) comm_dim[i] = 0; } - ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, dagger, comm_dim, profile); + ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, comm_dim, profile); } void GaugeLaplace::M(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const @@ -79,7 +79,11 @@ { // do nothing } - GaugeLaplacePC::GaugeLaplacePC(const DiracParam &param) : GaugeLaplace(param) { } + GaugeLaplacePC::GaugeLaplacePC(const DiracParam &param) : GaugeLaplace(param) + { + for (auto i = 0; i < 4; i++) + if (laplace3D == i) errorQuda("Cannot use 3-d operator with e/o preconditioning"); + } GaugeLaplacePC::GaugeLaplacePC(const GaugeLaplacePC &dirac) : GaugeLaplace(dirac) { } diff --git a/lib/interface_quda.cpp index 1d0f7d3688..6fd6382488 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1434,7 +1434,7 @@ void endQuda(void) namespace quda { - void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { double kappa = inv_param->kappa; if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { @@ -1549,7 +1549,7 @@ diracParam.use_mobius_fused_kernel = inv_param->use_mobius_fused_kernel; } - void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1567,7 +1567,7 @@ inv_param->cuda_prec_sloppy); } - void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracRefineParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1586,7 +1586,7 @@ } // The preconditioner currently mimics the sloppy operator with no comms - void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc, bool comms) + void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc, bool comms) { setDiracParam(diracParam, inv_param, pc); @@ -1617,7 +1617,7 @@ inv_param->cuda_prec_precondition); } - void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) + void setDiracEigParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc, bool use_smeared_gauge) { setDiracParam(diracParam, inv_param, pc); @@ -1625,6 +1625,20 @@ diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatExtended : gaugeExtended; diracParam.fatGauge = gaugeFatExtended; diracParam.longGauge = gaugeLongExtended; + } else if (use_smeared_gauge) { + if (!gaugeSmeared) errorQuda("No smeared gauge field present"); + if (inv_param->dslash_type == QUDA_LAPLACE_DSLASH) { + if (gaugeSmeared->GhostExchange() == QUDA_GHOST_EXCHANGE_EXTENDED) { + GaugeFieldParam gauge_param(*gaugePrecise); + GaugeField gaugeEig(gauge_param); + copyExtendedGauge(gaugeEig, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); + gaugeEig.exchangeGhost(); + std::swap(gaugeEig, *gaugeSmeared); + } + diracParam.gauge = gaugeSmeared; + } else { + errorQuda("Smeared gauge field not supported for operator %d", inv_param->dslash_type); + } } else { diracParam.gauge = inv_param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatEigensolver : gaugeEigensolver; diracParam.fatGauge = gaugeFatEigensolver; @@ -1646,7 +1660,7 @@ inv_param->cuda_prec_eigensolver); }
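// [Editor's note] When use_smeared_gauge is set, the eigensolver operator is
// built from gaugeSmeared rather than gaugeEigensolver. If the smeared field
// was created with extended ghost exchange, it is first repacked into a
// field with gaugePrecise's layout and its ghosts re-exchanged, presumably
// because the Laplace dslash expects an ordinary halo rather than an
// extended one.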
- void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve) + void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, bool pc_solve) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1663,8 +1677,7 @@ dPre = Dirac::create(diracPreParam); } - void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, - const bool pc_solve) + void createDiracWithRefine(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dRef, QudaInvertParam &param, bool pc_solve) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1684,8 +1697,8 @@ dRef = Dirac::create(diracRefParam); } - void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param, - const bool pc_solve) + void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param, bool pc_solve, + bool use_smeared_gauge) { DiracParam diracParam; DiracParam diracSloppyParam; @@ -1696,7 +1709,7 @@ setDiracSloppyParam(diracSloppyParam, &param, pc_solve); bool pre_comms_flag = (param.schwarz_type != QUDA_INVALID_SCHWARZ) ? false : true; setDiracPreParam(diracPreParam, &param, pc_solve, pre_comms_flag); - setDiracEigParam(diracEigParam, &param, pc_solve); + setDiracEigParam(diracEigParam, &param, pc_solve, use_smeared_gauge); d = Dirac::create(diracParam); // create the Dirac operator dSloppy = Dirac::create(diracSloppyParam); @@ -2392,14 +2405,17 @@ void checkClover(QudaInvertParam *param) { quda::GaugeField *checkGauge(QudaInvertParam *param) { - quda::GaugeField *cudaGauge = nullptr; - if (param->dslash_type != QUDA_ASQTAD_DSLASH) { - if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist"); + quda::GaugeField *U = param->dslash_type == QUDA_ASQTAD_DSLASH ? gaugeFatPrecise : gaugePrecise; - if (param->cuda_prec != gaugePrecise->Precision()) { - errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugePrecise->Precision()); - } + if (U == nullptr) + errorQuda("Precise gauge %sfield doesn't exist", param->dslash_type == QUDA_ASQTAD_DSLASH ? 
"fat " : ""); + + if (param->cuda_prec != U->Precision()) { + errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, U->Precision()); + } + if (param->dslash_type != QUDA_ASQTAD_DSLASH) { if (param->cuda_prec_sloppy != gaugeSloppy->Precision() || param->cuda_prec_precondition != gaugePrecondition->Precision() || param->cuda_prec_refinement_sloppy != gaugeRefinement->Precision() @@ -2416,18 +2432,10 @@ quda::GaugeField *checkGauge(QudaInvertParam *param) if (gaugePrecondition == nullptr) errorQuda("Precondition gauge field doesn't exist"); if (gaugeRefinement == nullptr) errorQuda("Refinement gauge field doesn't exist"); if (gaugeEigensolver == nullptr) errorQuda("Refinement gauge field doesn't exist"); - if (param->overlap) { - if (gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist"); - } - cudaGauge = gaugePrecise; + if (param->overlap && gaugeExtended == nullptr) errorQuda("Extended gauge field doesn't exist"); } else { - if (gaugeFatPrecise == nullptr) errorQuda("Precise gauge fat field doesn't exist"); if (gaugeLongPrecise == nullptr) errorQuda("Precise gauge long field doesn't exist"); - if (param->cuda_prec != gaugeFatPrecise->Precision()) { - errorQuda("Solve precision %d doesn't match gauge precision %d", param->cuda_prec, gaugeFatPrecise->Precision()); - } - if (param->cuda_prec_sloppy != gaugeFatSloppy->Precision() || param->cuda_prec_precondition != gaugeFatPrecondition->Precision() || param->cuda_prec_refinement_sloppy != gaugeFatRefinement->Precision() @@ -2450,23 +2458,18 @@ quda::GaugeField *checkGauge(QudaInvertParam *param) if (gaugeFatPrecondition == nullptr) errorQuda("Precondition gauge fat field doesn't exist"); if (gaugeFatRefinement == nullptr) errorQuda("Refinement gauge fat field doesn't exist"); if (gaugeFatEigensolver == nullptr) errorQuda("Eigensolver gauge fat field doesn't exist"); - if (param->overlap) { - if (gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist"); - } + if (param->overlap && gaugeFatExtended == nullptr) errorQuda("Extended gauge fat field doesn't exist"); if (gaugeLongSloppy == nullptr) errorQuda("Sloppy gauge long field doesn't exist"); if (gaugeLongPrecondition == nullptr) errorQuda("Precondition gauge long field doesn't exist"); if (gaugeLongRefinement == nullptr) errorQuda("Refinement gauge long field doesn't exist"); if (gaugeLongEigensolver == nullptr) errorQuda("Eigensolver gauge long field doesn't exist"); - if (param->overlap) { - if (gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist"); - } - cudaGauge = gaugeFatPrecise; + if (param->overlap && gaugeLongExtended == nullptr) errorQuda("Extended gauge long field doesn't exist"); } checkClover(param); - return cudaGauge; + return U; } void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse) @@ -2582,7 +2585,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Create the dirac operator with a sloppy and a precon. 
bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); - createDiracWithEig(d, dSloppy, dPre, dEig, *inv_param, pc_solve); + createDiracWithEig(d, dSloppy, dPre, dEig, *inv_param, pc_solve, eig_param->use_smeared_gauge); Dirac &dirac = *dEig; //------------------------------------------------------ @@ -2679,7 +2682,8 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Transfer Eigenpairs back to host if using GPU eigensolver. The copy // will automatically rotate from device UKQCD gamma basis to the // host side gamma basis. - for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); } + memcpy(host_evals, evals.data(), sizeof(Complex) * evals.size()); + if (!(eig_param->arpack_check)) { for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i]; } @@ -4977,7 +4981,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, for (unsigned int i = 0; i < n_steps; i++) { if (i) in = out; - ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, comm_dim, profileWuppertal); + ApplyLaplace(out, in, *precise, 3, a, b, in, parity, comm_dim, profileWuppertal); logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out)); } @@ -5123,7 +5127,9 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); - logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); + logQuda(QUDA_SUMMARIZE, "step %03d plaquette (mean %.16e, spatial %.16e temporal %.16e) Q charge = %.16e\n", 0, + obs_param[measurement_n].plaquette[0], obs_param[measurement_n].plaquette[1], + obs_param[measurement_n].plaquette[2], obs_param[measurement_n].qcharge); // set default dir_ignore = 3 for APE and STOUT for compatibility int dir_ignore = smear_param->dir_ignore; @@ -5142,13 +5148,15 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable case QUDA_GAUGE_SMEAR_HYP: HYPStep(*gaugeSmeared, tmp, smear_param->alpha1, smear_param->alpha2, smear_param->alpha3, dir_ignore); break; - default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); + default: errorQuda("Unknown gauge smear type %d", smear_param->smear_type); } if ((i + 1) % smear_param->meas_interval == 0) { measurement_n++; gaugeObservablesQuda(&obs_param[measurement_n]); - logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge); + logQuda(QUDA_SUMMARIZE, "step %03d plaquette (mean %.16e, spatial %.16e temporal %.16e) Q charge = %.16e\n", + i + 1, obs_param[measurement_n].plaquette[0], obs_param[measurement_n].plaquette[1], + obs_param[measurement_n].plaquette[2], obs_param[measurement_n].qcharge); } } @@ -5297,7 +5305,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] = Laplace [0] copyExtendedGauge(precise, gin, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp0, precise, 4, a, b, f_temp0, parity, comm_dim, profileGFlow); // [0] = [4] = Laplace [0] = Laplace [3] f_temp0 = f_temp4; @@ -5315,7 +5323,7 @@ void performGFlowQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaG // [4] <- Laplace [1] copyExtendedGauge(precise, gout, QUDA_CUDA_FIELD_LOCATION); 
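// [Editor's note] The boolean dagger argument has been dropped from every
// ApplyLaplace call site in this patch (here and in the Wuppertal-smearing
// path above): the Laplace stencil is Hermitian, so the daggered and
// un-daggered applications coincide and the flag was dead weight.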
precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp1, precise, 4, a, b, f_temp1, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp1, precise, 4, a, b, f_temp1, parity, comm_dim, profileGFlow); // [1] <- [4] f_temp1 = f_temp4; @@ -5333,7 +5341,7 @@ // [4] <- Laplace [2] copyExtendedGauge(precise, gin, QUDA_CUDA_FIELD_LOCATION); precise.exchangeGhost(); - ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, false, comm_dim, profileGFlow); + ApplyLaplace(f_temp4, f_temp2, precise, 4, a, b, f_temp2, parity, comm_dim, profileGFlow); // [2] <- [4] = Laplace [2] f_temp2 = f_temp4; diff --git a/lib/laplace.cu index aab6dd2d8e..5e5c3803eb 100644 --- a/lib/laplace.cu +++ b/lib/laplace.cu @@ -151,18 +151,18 @@ namespace quda LaplaceApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x, const GaugeField &U, int dir, double a, double b, int parity, - bool dagger, const int *comm_override, TimeProfile &profile) + const int *comm_override, TimeProfile &profile) { constexpr int nDim = 4; auto halo = ColorSpinorField::create_comms_batch(in); if (in.Nspin() == 1) { constexpr int nSpin = 1; - LaplaceArg<Float, nSpin, nColor, recon> arg(out, in, halo, U, dir, a, b, x, parity, dagger, comm_override); + LaplaceArg<Float, nSpin, nColor, recon> arg(out, in, halo, U, dir, a, b, x, parity, comm_override); Laplace<decltype(arg)> laplace(arg, out, in, halo); dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, halo, profile); } else if (in.Nspin() == 4) { constexpr int nSpin = 4; - LaplaceArg<Float, nSpin, nColor, recon> arg(out, in, halo, U, dir, a, b, x, parity, dagger, comm_override); + LaplaceArg<Float, nSpin, nColor, recon> arg(out, in, halo, U, dir, a, b, x, parity, comm_override); Laplace<decltype(arg)> laplace(arg, out, in, halo); dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, halo, profile); } else { @@ -175,11 +175,11 @@ // out(x) = M*in = - a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu) + b*in(x) // Omits direction 'dir' from the operator. void ApplyLaplace(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U, - int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, + int dir, double a, double b, cvector_ref<const ColorSpinorField> &x, int parity, const int *comm_override, TimeProfile &profile) { if constexpr (is_enabled<QUDA_LAPLACE_DSLASH>()) { - instantiate<LaplaceApply>(out, in, x, U, dir, a, b, parity, dagger, comm_override, profile); + instantiate<LaplaceApply>(out, in, x, U, dir, a, b, parity, comm_override, profile); } else { errorQuda("Laplace operator has not been enabled"); } diff --git a/lib/quda_fortran.F90 index 8eda362fbe..fc7666dd60 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -224,9 +224,6 @@ module quda_fortran real(8) :: temp real(8) :: clock - ! Enable auto-tuning? - QudaTune :: tune - ! Number of steps in s-step algorithms integer(4) :: nsteps
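// [Editor's note] With the QudaTune enum gone, tuning is toggled with plain
// bools, and the new push/pop helpers in lib/util_quda.cpp (below) scope a
// temporary override. Sketch:
//
//   pushTuning(false); // save the current setting and disable tuning
//   // ... launch kernels that must not retune, e.g. inside a tuned region
//   popTuning();       // restore the previous setting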
diff --git a/lib/reduce_helper.cu index cf95032656..72d18ba433 100644 --- a/lib/reduce_helper.cu +++ b/lib/reduce_helper.cu @@ -44,7 +44,7 @@ namespace quda void apply(const qudaStream_t &stream) { // intentionally do not autotune, since this can be called inside a tuning region - auto tp = tuneLaunch(*this, QUDA_TUNE_NO, getVerbosity()); + auto tp = tuneLaunch(*this, false, getVerbosity()); launch_device(tp, stream, init_arg(reduce_count, n_reduce)); } }; diff --git a/lib/solve.cpp index c9ba9baf21..79fa868633 100644 --- a/lib/solve.cpp +++ b/lib/solve.cpp @@ -317,8 +317,8 @@ getProfile().TPSTOP(QUDA_PROFILE_EPILOGUE); } - void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param, - const bool pc_solve); + void createDiracWithEig(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, Dirac *&dEig, QudaInvertParam &param, bool pc_solve, + bool use_smeared_gauge); extern std::vector<ColorSpinorField> solutionResident; @@ -349,7 +349,8 @@ // Create the dirac operator and operators for sloppy, precondition, // and an eigensolver - createDiracWithEig(dirac, diracSloppy, diracPre, diracEig, param, pc_solve); + createDiracWithEig(dirac, diracSloppy, diracPre, diracEig, param, pc_solve, + param.eig_param ? static_cast<QudaEigParam *>(param.eig_param)->use_smeared_gauge : false); // wrap CPU host side pointers ColorSpinorParam cpuParam(hp_b[0], param, u.X(), pc_solution, param.input_location); diff --git a/lib/tune.cpp index 3bcd7da964..2aefd3988f 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -152,9 +152,37 @@ const map &getTuneCache() { return tunecache; } /** - * Deserialize tunecache from an istream, useful for reading a file or receiving from other nodes. + * @brief Distribute the tunecache from a given rank to all other nodes. + * @param[in] root_rank From which global rank to do the broadcast + * @param[out] tc_recv Where we wish to receive the tunecache. This + * defaults to the local tunecache. + */ + static void broadcastTuneCache(int32_t root_rank = 0, map &tc_recv = tunecache); + + void joinTuneCache(const std::vector<int> &global_tune_rank) + { + // vector of the split tunecaches + std::vector<map> split_tc(global_tune_rank.size()); + // broadcast each tunecache to every process + for (auto i = 0u; i < global_tune_rank.size(); i++) { + broadcastTuneCache(global_tune_rank[i], split_tc[i]); + if (comm_rank() == global_tune_rank[i]) split_tc[i] = tunecache; + logQuda(QUDA_DEBUG_VERBOSE, "i = %d tune_rank = %d tc size = %lu\n", i, global_tune_rank[i], split_tc[i].size()); + } + + // now merge the maps + tunecache = split_tc[0]; + for (auto i = 1u; i < global_tune_rank.size(); i++) { tunecache.merge(split_tc[i]); } + }
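// [Editor's note] std::map::merge leaves the destination's entry in place
// when a key exists in both maps, so after joinTuneCache any kernel tuned by
// several split-grid partitions resolves to the entry from the earliest rank
// in global_tune_rank.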
+ + /** + * Deserialize tunecache from an istream, useful for reading a file + * or receiving from other nodes. + * @param[in] in The stream from which we are deserializing + * @param[out] tc The tunecache to which we are deserializing. This + * defaults to the local tunecache. */ - static void deserializeTuneCache(std::istream &in) + static void deserializeTuneCache(std::istream &in, map &tc = tunecache) { std::string line; std::stringstream ls; @@ -185,7 +213,7 @@ ls.ignore(1); // throw away tab before comment getline(ls, param.comment); // assume anything remaining on the line is a comment param.comment += "\n"; // our convention is to include the newline, since ctime() likes to do this - tunecache[key] = param; + tc[key] = param; } } @@ -320,30 +348,26 @@ } } - /** - * @brief Distribute the tunecache from a given rank to all other nodes. - * @param[in] root_rank From which global rank to do the broadcast - */ - static void broadcastTuneCache(int32_t root_rank = 0) + static void broadcastTuneCache(int32_t root_rank, map &tc_recv) { std::stringstream serialized; size_t size; - if (comm_rank_global() == root_rank) { + if (comm_rank() == root_rank) { serializeTuneCache(serialized); size = serialized.str().length(); } - comm_broadcast_global(&size, sizeof(size_t), root_rank); + comm_broadcast(&size, sizeof(size_t), root_rank); if (size > 0) { - if (comm_rank_global() == root_rank) { - comm_broadcast_global(const_cast<char *>(serialized.str().c_str()), size, root_rank); + if (comm_rank() == root_rank) { + comm_broadcast(const_cast<char *>(serialized.str().c_str()), size, root_rank); } else { std::vector<char> serstr(size + 1); - comm_broadcast_global(serstr.data(), size, root_rank); + comm_broadcast(serstr.data(), size, root_rank); serstr[size] = '\0'; // null-terminate serialized.str(serstr.data()); - deserializeTuneCache(serialized); + deserializeTuneCache(serialized, tc_recv); } } } @@ -353,7 +377,7 @@ */ void loadTuneCache() { - if (getTuning() == QUDA_TUNE_NO) { + if (!getTuning()) { warningQuda("Autotuning disabled"); return; } @@ -826,20 +850,20 @@ */ void broadcast(int32_t root_rank) { - size_t size; + size_t size = 0; std::string serialized; - if (comm_rank_global() == root_rank) { + if (comm_rank() == root_rank) { serialized = serialize(); size = serialized.length(); } - comm_broadcast_global(&size, sizeof(size_t), root_rank); + comm_broadcast(&size, sizeof(size_t), root_rank); if (size > 0) { - if (comm_rank_global() == root_rank) { - comm_broadcast_global(const_cast<char *>(serialized.c_str()), size, root_rank); + if (comm_rank() == root_rank) { + comm_broadcast(const_cast<char *>(serialized.c_str()), size, root_rank); } else { std::vector<char> serstr(size + 1); - comm_broadcast_global(serstr.data(), size, root_rank); + comm_broadcast(serstr.data(), size, root_rank); serstr[size] = '\0'; // null-terminate std::string_view deserialized(serstr.data()); deserialize(deserialized); @@ -859,7 +883,7 @@
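// [Editor's note] Both broadcast paths above use the same two-phase pattern:
// broadcast the serialised length first, then the payload. Swapping
// comm_broadcast_global for comm_broadcast scopes the exchange to the
// current (possibly split-grid) communicator, which is what joinTuneCache
// relies on to gather per-partition caches.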
* Return the optimal launch parameters for a given kernel, either * by retrieving them from tunecache or autotuning on the spot. */ - TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity) + TuneParam tuneLaunch(Tunable &tunable, bool enabled, QudaVerbosity verbosity) { #ifdef LAUNCH_TIMER launchTimer.TPSTART(QUDA_PROFILE_TOTAL); @@ -880,7 +904,7 @@ it = tunecache.find(key); // first check if we have the tuned value and return if we have it - if (enabled == QUDA_TUNE_YES && it != tunecache.end()) { + if (enabled && it != tunecache.end()) { #ifdef LAUNCH_TIMER launchTimer.TPSTOP(QUDA_PROFILE_PREAMBLE); @@ -926,7 +950,7 @@ static TuneParam param; - if (enabled == QUDA_TUNE_NO) { + if (!enabled) { TuneParam param_default; param_default.aux = make_int4(-1, -1, -1, -1); tunable.defaultTuneParam(param_default); @@ -959,7 +983,7 @@ - we are tuning an uber kernel in which case do the tuning on all ranks since we can't guarantee that all nodes are partaking */ - if (comm_rank_global() == tune_rank || !commGlobalReduction() || policyTuning() || uberTuning()) { + if (comm_rank() == tune_rank || !commGlobalReduction() || policyTuning() || uberTuning()) { TuneParam best_param; TuneCandidates tc(tunable.num_candidates()); float best_time; diff --git a/lib/util_quda.cpp index e279c0f43e..b033efb20c 100644 --- a/lib/util_quda.cpp +++ b/lib/util_quda.cpp @@ -51,17 +51,18 @@ bool getRankVerbosity() { return rank_verbosity; } +static bool tune = true; + // default has autotuning enabled but can be overridden with the QUDA_ENABLE_TUNING environment variable -QudaTune getTuning() { +bool getTuning() +{ static bool init = false; - static QudaTune tune = QUDA_TUNE_YES; - if (!init) { char *enable_tuning = getenv("QUDA_ENABLE_TUNING"); - if (!enable_tuning || strcmp(enable_tuning,"0")!=0) { - tune = QUDA_TUNE_YES; + if (!enable_tuning || strcmp(enable_tuning, "0") != 0) { + tune = true; } else { - tune = QUDA_TUNE_NO; + tune = false; } init = true; } @@ -69,6 +70,34 @@ return tune; } +void setTuning(bool tuning) +{ + // first check if tuning is disabled, in which case we do nothing + static bool init = false; + static bool tune_disable = false; + if (!init) { + char *enable_tuning = getenv("QUDA_ENABLE_TUNING"); + tune_disable = (enable_tuning && strcmp(enable_tuning, "0") == 0); + init = true; + } + if (!tune_disable) tune = tuning; +} + +static std::stack<bool> tstack; + +void pushTuning(bool tuning) +{ + tstack.push(getTuning()); + setTuning(tuning); +} + +void popTuning() +{ + if (tstack.empty()) errorQuda("popTuning() called with empty stack"); + setTuning(tstack.top()); + tstack.pop(); +} + void setOutputPrefix(const char *prefix) { strncpy(prefix_, prefix, MAX_PREFIX_SIZE); @@ -80,7 +109,6 @@ void setOutputFile(FILE *outfile) { outfile_ = outfile; } - static std::stack<QudaVerbosity> vstack; void pushVerbosity(QudaVerbosity verbosity) diff --git a/tests/CMakeLists.txt index 2beb90ceae..da3e7a97bf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,7 +110,7 @@ if(QUDA_DIRAC_WILSON install(TARGETS deflated_invert_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() -if(QUDA_DIRAC_STAGGERED) +if(QUDA_DIRAC_STAGGERED OR QUDA_DIRAC_LAPLACE) add_executable(staggered_dslash_test staggered_dslash_test.cpp) target_link_libraries(staggered_dslash_test ${TEST_LIBS}) quda_checkbuildtest(staggered_dslash_test QUDA_BUILD_ALL_TESTS) @@ -291,6 +291,8 @@ if(QUDA_MPI OR QUDA_QMP) message(STATUS "ctest will run on ${QUDA_TEST_NUM_PROCS} processes") set(QUDA_CTEST_LAUNCH 
${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS} CACHE STRING "CTest Launcher command for QUDA's tests") +else() + set(QUDA_TEST_NUM_PROCS 1) endif() # BLAS tests @@ -1009,19 +1011,15 @@ if(QUDA_DIRAC_WILSON) --enable-testing true --gtest_output=xml:invert_test_wilson.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson - COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson.xml) + add_test(NAME invert_test_splitgrid_wilson + COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} + --dslash-type wilson --ngcrkrylov 8 + --dim 2 4 6 8 --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_wilson.xml) - set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) endif() if(QUDA_DIRAC_TWISTED_MASS) @@ -1186,19 +1184,15 @@ foreach(prec IN LISTS TEST_PRECS) --enable-testing true --gtest_output=xml:invert_test_staggered_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_staggered_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} - --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true - --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + add_test(NAME invert_test_splitgrid_staggered_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} + --dslash-type staggered --ngcrkrylov 8 --compute-fat-long true + --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} --nsrc-tile ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_staggered_${prec}.xml) + + set_tests_properties(invert_test_splitgrid_staggered_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) add_test(NAME invert_test_asqtad_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 --enable-testing true --nsrc 4 --nsrc-tile 4 --gtest_output=xml:invert_test_asqtad_${prec}.xml) - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_asqtad_${prec} - COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} - --dslash-type asqtad --ngcrkrylov 8 --compute-fat-long true - --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) - - set_tests_properties(invert_test_splitgrid_asqtad_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() + add_test(NAME invert_test_splitgrid_asqtad_${prec} + COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} + --dslash-type asqtad 
--ngcrkrylov 8 --compute-fat-long true + --dim 6 6 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000 + --nsrc ${QUDA_TEST_NUM_PROCS} + --enable-testing true + --gtest_output=xml:invert_test_splitgrid_asqtad_${prec}.xml) if (QUDA_DIRAC_LAPLACE) add_test(NAME invert_test_laplace_${prec} @@ -1243,21 +1231,6 @@ if (QUDA_DIRAC_DISTANCE_PRECONDITIONING) --distance-pc-alpha0 0.1 --distance-pc-t0 4 --enable-testing true --gtest_output=xml:invert_test_wilson_distance_pc.xml) - - if(DEFINED ENV{QUDA_ENABLE_TUNING}) - if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) - add_test(NAME invert_test_splitgrid_wilson - COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS} - --dslash-type wilson --ngcrkrylov 8 - --dim 2 4 6 8 --niter 1000 - --distance-pc-alpha0 0.1 --distance-pc-t0 4 - --nsrc ${QUDA_TEST_NUM_PROCS} - --enable-testing true - --gtest_output=xml:invert_test_splitgrid_wilson_distance_pc.xml) - - set_tests_properties(invert_test_splitgrid_wilson PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) - endif() - endif() endif() if(QUDA_DIRAC_CLOVER) diff --git a/tests/clover_force_test.cpp index 4a9edb57b2..5847bec40a 100644 --- a/tests/clover_force_test.cpp +++ b/tests/clover_force_test.cpp @@ -127,7 +127,7 @@ std::tuple clover_force_test(test_t param) gauge_param.gauge_order = QUDA_MILC_GAUGE_ORDER; gauge_param.overwrite_mom = 1; - if (getTuning() == QUDA_TUNE_YES) + if (getTuning()) computeTMCloverForceQuda(mom.data(), in.data(), in0.data(), coeff.data(), nvector, &gauge_param, &inv_param, detratio); diff --git a/tests/dslash_test_utils.h index 3d77668d97..ec4ca5b13a 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -1062,12 +1062,12 @@ struct DslashTestWrapper { 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min, 2 * ghost_bytes); ::testing::Test::RecordProperty("Gflops", std::to_string(1.0e-9 * flops / dslash_time.event_time)); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_GPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes); } } diff --git a/tests/eigensolve_test.cpp index e325511e05..a9561ca2a5 100644 --- a/tests/eigensolve_test.cpp +++ b/tests/eigensolve_test.cpp @@ -191,17 +191,14 @@ std::vector eigensolve(test_t test_param) // Host side arrays to store the eigenpairs computed by QUDA int n_eig = eig_n_conv; if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; - std::vector<quda::ColorSpinorField> evecs(n_eig); quda::ColorSpinorParam cs_param; constructWilsonTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); // Void pointers to host side arrays, compatible with the QUDA interface. std::vector<void *> host_evecs_ptr(n_eig); // Allocate host side memory and pointers - for (int i = 0; i < n_eig; i++) { - evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].data(); - } + std::vector<quda::ColorSpinorField> evecs(n_eig, cs_param); + for (int i = 0; i < n_eig; i++) host_evecs_ptr[i] = evecs[i].data(); // Complex eigenvalues std::vector<__complex__ double> evals(eig_n_conv);
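// [Editor's note] For the 3-d solver the interface now copies back
// evals.size() eigenvalues rather than n_conv, laid out as one block of
// n_conv values per global timeslice (cf. TRLM3D::computeEvals above).
// Hypothetical host-side unpacking (sketch):
//
//   double _Complex lambda_ti = host_evals[t_global * n_conv + i];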
diff --git a/tests/gauge_path_test.cpp index 1efa49ef97..c7820d8575 100644 --- a/tests/gauge_path_test.cpp +++ b/tests/gauge_path_test.cpp @@ -179,7 +179,7 @@ void gauge_force_test(force_test_t test_param) errorQuda("Unsupported gauge order %d", gauge_order); } - if (getTuning() == QUDA_TUNE_YES) { + if (getTuning()) { if (compute_force) computeGaugeForceQuda(mom, sitelink, input_path_buf, length, loop_coeff_d, num_paths, max_length, eb3, &gauge_param); @@ -324,7 +324,7 @@ // compute the plaquette as part of validation obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - if (getTuning() == QUDA_TUNE_YES) { gaugeObservablesQuda(&obsParam); } + if (getTuning()) { gaugeObservablesQuda(&obsParam); } quda::host_timer_t host_timer; // Multiple execution to exclude warmup time in the first run diff --git a/tests/host_reference/dslash_reference.cpp index 5bada81761..9b6ef102c8 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -746,17 +746,17 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx) + QudaInvertParam &inv_param, int laplace3D, int src_idx) { std::vector<quda::ColorSpinorField> out_vector(1); out_vector[0] = out; - return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param, src_idx); + return verifyStaggeredInversion(in, out_vector, fat_link, long_link, inv_param, laplace3D, src_idx); } std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx) + QudaInvertParam &inv_param, int laplace3D, int src_idx) { int dagger = inv_param.dagger == QUDA_DAG_YES ? 
1 : 0; double l2r_max = 0.0; @@ -783,7 +783,7 @@ for (int i = 0; i < multishift; i++) { auto &out = out_vector[i]; double mass = 0.5 * sqrt(inv_param.offset[i]); - stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type, laplace3D); mxpy(in.data(), ref.data(), in.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); @@ -809,10 +809,7 @@ auto &out = out_vector[0]; double mass = inv_param.mass; if (inv_param.solution_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); - - // correct for the massRescale function inside invertQuda - if (is_laplace(dslash_type)) ax(0.5 / kappa, ref.data(), ref.Length(), ref.Precision()); + stag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type, laplace3D); } else if (inv_param.solution_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, out, mass, 0, parity, dslash_type, laplace3D); } else if (inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, out, mass, dagger, dslash_type, laplace3D); } else { errorQuda("Invalid staggered solution type %d", inv_param.solution_type); } @@ -844,8 +841,9 @@ return {l2r_max, hqr_max}; } -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link) +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std::vector<double _Complex> &lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link, + int laplace3D) { QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; @@ -861,10 +859,7 @@ else sol_type = QUDA_MATDAG_MAT_SOLUTION; } else { - if (use_pc) - sol_type = QUDA_MATPC_SOLUTION; - else - sol_type = QUDA_MAT_SOLUTION; + sol_type = use_pc ? 
QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; } // Create temporary spinors @@ -872,7 +867,7 @@ quda::ColorSpinorField ref(csParam); if (sol_type == QUDA_MAT_SOLUTION) { - stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type, laplace3D); } else if (sol_type == QUDA_MATPC_SOLUTION) { QudaParity parity = QUDA_INVALID_PARITY; switch (inv_param.matpc_type) { case QUDA_MATPC_ODD_ODD: parity = QUDA_ODD_PARITY; break; default: errorQuda("Unexpected matpc_type %s", get_matpc_str(inv_param.matpc_type)); break; } - stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type); + stag_matpc(ref, fat_link, long_link, spinor, mass, 0, parity, dslash_type, laplace3D); } else if (sol_type == QUDA_MATDAG_MAT_SOLUTION) { - stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type); + stag_matdag_mat(ref, fat_link, long_link, spinor, mass, dagger, dslash_type, laplace3D); } - // Compute M * x - \lambda * x - caxpy(-lambda, spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); - double l2r = sqrt(nrm2 / src2); + if (laplace3D == 3) { + auto batch_size = (spinor.VolumeCB() / spinor.X()[3]) * stag_spinor_site_size; + auto t_offset = spinor.X()[3] * comm_coord(3); + std::vector<double> nrm2(spinor.X()[3] * comm_dim(3), 0.0); + std::vector<double> src2(spinor.X()[3] * comm_dim(3), 0.0); + // Compute M * x - \lambda * x on each slice + for (auto t = 0; t < spinor.X()[3]; t++) { + auto t_global = t_offset + t; + auto offset = t * batch_size * inv_param.cpu_prec; - printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + for (int parity = 0; parity < spinor.SiteSubset(); parity++) { + caxpy(-lambda[t_global], static_cast<char *>(spinor.data()) + offset, static_cast<char *>(ref.data()) + offset, + batch_size, inv_param.cpu_prec); - return l2r; + nrm2[t_global] += norm_2(static_cast<char *>(ref.data()) + offset, batch_size, inv_param.cpu_prec, false); + src2[t_global] += norm_2(static_cast<char *>(spinor.data()) + offset, batch_size, inv_param.cpu_prec, false); + + offset += spinor.VolumeCB() * stag_spinor_site_size * inv_param.cpu_prec; + } + } + + comm_allreduce_sum(nrm2); + comm_allreduce_sum(src2); + + for (auto t = 0; t < spinor.X()[3] * comm_dim(3); t++) { + auto l = ((double *)&(lambda[t]))[0]; + printf("Eigenvector %4d, t = %d lambda = %15.14e: tol %.2e, host residual = %.15e\n", i, t, l, eig_param.tol, + sqrt(nrm2[t] / src2[t])); + } + auto total_nrm2 = std::accumulate(nrm2.begin(), nrm2.end(), 0.0); + auto total_src2 = std::accumulate(src2.begin(), src2.end(), 0.0); + + return sqrt(total_nrm2 / total_src2); + } else { + // Compute M * x - \lambda * x + caxpy(-lambda[0], spinor.data(), ref.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(spinor.data(), spinor.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + double l2r = sqrt(nrm2 / src2); + printfQuda("Eigenvector %4d: tol %.2e, host residual = %.15e\n", i, eig_param.tol, l2r); + return l2r; + } }
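// [Editor's note] In the laplace3D == 3 branch above, the host field is
// addressed in bytes: offset starts at t * batch_size * cpu_prec inside the
// even-parity half and is then advanced by VolumeCB * site_size * cpu_prec
// to reach the same timeslice in the odd-parity half, so each per-slice
// residual accumulates both parities before the global reduction. Note also
// that the std::accumulate initial values must be 0.0, not 0, or the double
// sums would be truncated to integers.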
double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, - double _Complex sigma, int i, QudaEigParam &eig_param, - quda::GaugeField &fat_link, quda::GaugeField &long_link) + const std::vector<double _Complex> &sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link, int laplace3D) { + if (laplace3D == 3) errorQuda("3-d Laplace operator not supported"); QudaInvertParam &inv_param = *(eig_param.invert_param); int dagger = inv_param.dagger == QUDA_DAG_YES ? 1 : 0; bool use_pc = (eig_param.use_pc == QUDA_BOOLEAN_TRUE ? true : false); @@ -912,10 +941,10 @@ quda::ColorSpinorField ref(csParam); // Only `mat` is used here - stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type); + stag_mat(ref, fat_link, long_link, spinor_left, mass, dagger, dslash_type, laplace3D); // Compute M * x_left - \sigma * x_right - caxpy(-sigma, spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); + caxpy(-sigma[0], spinor_right.data(), ref.data(), spinor_right.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double nrm2 = norm_2(ref.data(), ref.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double src2 = norm_2(spinor_left.data(), spinor_left.Volume() * stag_spinor_site_size, inv_param.cpu_prec); double l2r = sqrt(nrm2 / src2); diff --git a/tests/host_reference/dslash_reference.h index 3c9543107e..307bb1db70 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -217,11 +217,13 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace * @param inv_param Invert params, used to query the solve type, etc + * @param laplace3D Whether we are working on the 3-d Laplace operator + * @param src_idx The source index we are working on (when doing multi-RHS) * @return The residual and HQ residual (if requested) */ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, quda::ColorSpinorField &out, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx); + QudaInvertParam &inv_param, int laplace3D, int src_idx); /** * @brief Verify a single- or multi-shift staggered inversion on the host * * @param in The host input spinor * @param out The host output spinor(s) * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace * @param inv_param Invert params, used to query the solve type, etc, also includes the shifts + * @param laplace3D Whether we are working on the 3-d Laplace operator + * @param src_idx The source index we are working on (when doing multi-RHS) * @return The residual and HQ residual (if requested) */ std::array<double, 2> verifyStaggeredInversion(quda::ColorSpinorField &in, std::vector<quda::ColorSpinorField> &out_vector, quda::GaugeField &fat_link, quda::GaugeField &long_link, - QudaInvertParam &inv_param, int src_idx = 0); + QudaInvertParam &inv_param, int laplace3D, int src_idx = 0);
* @param lambda The host eigenvalue(s) to be verified * @param i The number of the eigenvalue, only used when printing outputs * @param eig_param Eigensolve params, used to query the operator type, etc * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace + * @param laplace3D Whether we are working on the 3-d Laplace operator * @return The residual norm */ -double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Complex lambda, int i, - QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link); +double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, const std::vector<double _Complex> &lambda, int i, + QudaEigParam &eig_param, quda::GaugeField &fat_link, quda::GaugeField &long_link, + int laplace3D); /** * @brief Verify a staggered-type singular vector @@ -262,11 +268,12 @@ double verifyStaggeredTypeEigenvector(quda::ColorSpinorField &spinor, double _Co * @param eig_param Eigensolve params, used to query the operator type, etc * @param fat_link The fat links in the context of an ASQTAD solve; otherwise the base gauge links with phases applied * @param long_link The long links; null for naive staggered and Laplace + * @param laplace3D Whether we are working on the 3-d Laplace operator * @return The residual norm */ double verifyStaggeredTypeSingularVector(quda::ColorSpinorField &spinor_left, quda::ColorSpinorField &spinor_right, - double _Complex sigma, int i, QudaEigParam &eig_param, - quda::GaugeField &fat_link, quda::GaugeField &long_link); + const std::vector<double _Complex> &sigma, int i, QudaEigParam &eig_param, + quda::GaugeField &fat_link, quda::GaugeField &long_link, int laplace3D); /** * @brief Verify the spinor distance reweighting
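Note: the eigenvalue arguments above are now vectors so that the batched 3-d solver can pass one value per timeslice; for the ordinary 4-d operator the vector simply has length one. A minimal calling sketch (the names evals, evecs, cpuFatQDP and cpuLongQDP follow the eigensolve test further below; the wrapping itself is illustrative, not part of the patch):

    // 4-d operator: one eigenvalue per eigenvector, laplace3D = 4
    std::vector<double _Complex> lambda = {evals[i]};
    double resid = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP,
                                                  4 /* laplace3D: regular 4-d operator */);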
diff --git a/tests/host_reference/staggered_dslash_reference.cpp index 6ecbfe94de..2e7e4e0f8a 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -29,13 +29,19 @@ * @param[in] oddBit The odd/even bit for the site index * @param[in] daggerBit Perform the ordinary dslash (0) or Hermitian conjugate (1) * @param[in] dslash_type The type of Dslash operation + * @param[in] laplace3D Whether we are applying the 3-d Laplace operator + * (in the case of dslash_type being QUDA_LAPLACE_DSLASH) */ template <typename real_t> void staggeredDslashReference(real_t *res, const real_t *const *fatlink, const real_t *const *longlink, const real_t *const *ghostFatlink, const real_t *const *ghostLonglink, const real_t *spinorField, const real_t *const *fwd_nbr_spinor, - const real_t *const *back_nbr_spinor, int oddBit, int daggerBit, QudaDslashType dslash_type) + const real_t *const *back_nbr_spinor, int oddBit, int daggerBit, + QudaDslashType dslash_type, int laplace3D) { + if (laplace3D < 4 && dslash_type != QUDA_LAPLACE_DSLASH) + errorQuda("laplace3D = %d only supported for Laplace dslash (%d requested)", laplace3D, dslash_type); + #pragma omp parallel for for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0; @@ -66,6 +72,7 @@ int offset = stag_spinor_site_size * sid; for (int dir = 0; dir < 8; dir++) { + if (laplace3D == dir / 2) continue; // skip dimensions if needed const int nFace = dslash_type == QUDA_ASQTAD_DSLASH ? 3 : 1; const real_t *fatlnk = gaugeLink(sid, dir, oddBit, fatlinkEven, fatlinkOdd, ghostFatlinkEven, ghostFatlinkOdd, 1, 1); @@ -108,7 +115,7 @@ } void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type, int laplace3D) { // assert sPrecision and gPrecision must be the same if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precision are not the same"); } @@ -143,23 +150,22 @@ reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), reinterpret_cast(ghost_longlink), in.data(), reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type, + laplace3D); } else if (in.Precision() == QUDA_SINGLE_PRECISION) { staggeredDslashReference(out.data(), reinterpret_cast(qdp_fatlink), reinterpret_cast(qdp_longlink), reinterpret_cast(ghost_fatlink), reinterpret_cast(ghost_longlink), in.data(), reinterpret_cast(in.fwdGhostFaceBuffer), - reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type); + reinterpret_cast(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type, + laplace3D); } } void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { - errorQuda("The spinor precision and gauge precision are not the same"); - } + checkPrecision(in, fat_link); // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -169,11 +175,12 @@ // {{m, -D_eo},{-D_oe,m}}, while the CPU verify function does not // have the minus sign. Inverting the expected dagger convention // solves this discrepancy. - stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type); - stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type); + stag_dslash(out.Even(), fat_link, long_link, in.Odd(), QUDA_EVEN_PARITY, 1 - daggerBit, dslash_type, laplace3D); + stag_dslash(out.Odd(), fat_link, long_link, in.Even(), QUDA_ODD_PARITY, 1 - daggerBit, dslash_type, laplace3D); if (dslash_type == QUDA_LAPLACE_DSLASH) { - double kappa = 1.0 / (8 + mass); + int dimension = laplace3D < 4 ?
3 : 4; + double kappa = 1.0 / (2 * dimension + mass); xpay(in.data(), kappa, out.data(), out.Length(), out.Precision()); } else { axpy(2 * mass, in.data(), out.data(), out.Length(), out.Precision()); @@ -181,12 +188,9 @@ } void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { - errorQuda("The spinor precision and gauge precision are not the same"); - } + checkPrecision(in, fat_link); // assert we have full-parity spinors if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET || in.SiteSubset() != QUDA_FULL_SITE_SUBSET) @@ -197,15 +201,16 @@ quda::ColorSpinorField tmp(csParam); // Apply mat in sequence - stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type); - stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type); + stag_mat(tmp, fat_link, long_link, in, mass, daggerBit, dslash_type, laplace3D); + stag_mat(out, fat_link, long_link, tmp, mass, 1 - daggerBit, dslash_type, laplace3D); } void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type) + const ColorSpinorField &in, double mass, int, QudaParity parity, QudaDslashType dslash_type, + int laplace3D) { - // assert sPrecision and gPrecision must be the same - if (in.Precision() != fat_link.Precision()) { errorQuda("The spinor precision and gauge precison are not the same"); } + if (laplace3D < 4) errorQuda("Cannot use 3-d operator with e/o preconditioning"); + checkPrecision(in, fat_link); // assert we have single-parity spinors if (out.SiteSubset() != QUDA_PARITY_SITE_SUBSET || in.SiteSubset() != QUDA_PARITY_SITE_SUBSET) @@ -225,8 +230,8 @@ quda::ColorSpinorField tmp(csParam); // dagger bit does not matter - stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type); - stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type); + stag_dslash(tmp, fat_link, long_link, in, otherparity, 0, dslash_type, laplace3D); + stag_dslash(out, fat_link, long_link, tmp, parity, 0, dslash_type, laplace3D); double msq_x4 = mass * mass * 4; if (in.Precision() == QUDA_DOUBLE_PRECISION) {
diff --git a/tests/host_reference/staggered_dslash_reference.h index 34625abfa3..557be21bd0 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -21,9 +21,13 @@ void setDims(int *); * @param[in] oddBit 0 for D_eo, 1 for D_oe * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, int oddBit, int daggerBit, QudaDslashType dslash_type, int laplace3D);
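Note: the laplace3D convention documented here can be read off the direction loop in the reference implementation above; a distilled sketch (illustrative only):

    // dir indexes the 8 one-hop terms: dir / 2 is the dimension (0 = x, 1 = y, 2 = z, 3 = t)
    // and dir % 2 selects the forward or backward hop, so laplace3D == dir / 2 drops both
    // hops in the orthogonal dimension, e.g. laplace3D == 3 drops dir 6 and 7 (the t hops).
    for (int dir = 0; dir < 8; dir++) {
      if (laplace3D == dir / 2) continue; // skip the orthogonal dimension
      // ... accumulate the fat-link (and long-link) hop terms as usual ...
    }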
/** * @brief Apply the full parity staggered-type dslash @@ -35,9 +39,13 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF * @param[in] mass Mass for the dslash operator * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the full parity staggered-type matdag_mat @@ -49,9 +57,13 @@ void stag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeFiel * @param[in] mass Mass for the dslash operator * @param[in] daggerBit 0 for the regular operator, 1 for the dagger operator * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled). */ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int daggerBit, QudaDslashType dslash_type, int laplace3D); /** * @brief Apply the even-even or odd-odd preconditioned staggered dslash @@ -64,6 +76,11 @@ void stag_matdag_mat(ColorSpinorField &out, const GaugeField &fat_link, const Ga * @param[in] dagger_bit 0 for the regular operator, 1 for the dagger operator --- irrelevant for the HPD preconditioned operator * @param[in] parity Parity of preconditioned dslash * @param[in] dslash_type Dslash type + * @param[in] laplace3D Whether we are applying the 3-d variant of the + * Laplace operator. A value of 4 is the regular 4-d operator, and a + * value < 4 implies a 3-d operator with the value designating the + * orthogonal dimension (i.e., the dimension not enabled).
*/ void stag_matpc(ColorSpinorField &out, const GaugeField &fat_link, const GaugeField &long_link, - const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type); + const ColorSpinorField &in, double mass, int dagger_bit, QudaParity parity, QudaDslashType dslash_type, + int laplace3D);
diff --git a/tests/io_test.cpp index 2618b0e479..25296b02b9 100644 --- a/tests/io_test.cpp +++ b/tests/io_test.cpp @@ -31,6 +31,9 @@ TEST_P(GaugeIOTest, verify) gauge_param.cpu_prec = ::testing::get<0>(param); if (!quda::is_enabled(gauge_param.cpu_prec)) GTEST_SKIP(); gauge_param.cuda_prec = gauge_param.cpu_prec; + gauge_param.cuda_prec_sloppy = gauge_param.cpu_prec; + gauge_param.cuda_prec_precondition = gauge_param.cpu_prec; + gauge_param.cuda_prec_eigensolver = gauge_param.cpu_prec; gauge_param.t_boundary = QUDA_PERIODIC_T; // Allocate host side memory for the gauge field.
diff --git a/tests/staggered_dslash_test_utils.h index a9b499d71a..1b5609bfda 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -85,14 +85,16 @@ struct StaggeredDslashTestWrapper { for (int i = 0; i < Nsrc; i++) { switch (dtest_type) { case dslash_test_type::Dslash: - stag_dslash(spinorRef[i], cpuFat, cpuLong, spinor[i], parity, dagger, dslash_type); + stag_dslash(spinorRef[i], cpuFat, cpuLong, spinor[i], parity, dagger, dslash_type, laplace3D); break; case dslash_test_type::MatPC: - stag_matpc(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, 0, parity, dslash_type); + stag_matpc(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, 0, parity, dslash_type, laplace3D); + break; + case dslash_test_type::Mat: + stag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type, laplace3D); break; - case dslash_test_type::Mat: stag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type); break; case dslash_test_type::MatDagMat: - stag_matdag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type); + stag_matdag_mat(spinorRef[i], cpuFat, cpuLong, spinor[i], mass, dagger, dslash_type, laplace3D); break; default: errorQuda("Test type %d not defined", static_cast<int>(dtest_type)); } @@ -406,12 +408,12 @@ struct StaggeredDslashTestWrapper { size_t ghost_bytes = cudaSpinor[0].GhostBytes(); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_GPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU", + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.cpu_time); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); - ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_min", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_max); + ::testing::Test::RecordProperty("Halo_bidirectional_BW_CPU_max", 1.0e-9 * 2 * ghost_bytes / dslash_time.cpu_min); ::testing::Test::RecordProperty("Halo_message_size_bytes", 2 * ghost_bytes); printfQuda(
diff --git a/tests/staggered_eigensolve_test.cpp index 25d10cf91a..95ebff044b 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -75,12 +75,6 @@ void init() gauge_param = newQudaGaugeParam();
setStaggeredGaugeParam(gauge_param); - QudaGaugeSmearParam smear_param; - if (gauge_smear) { - smear_param = newQudaGaugeSmearParam(); - setGaugeSmearParam(smear_param); - } - // Though no inversions are performed, the inv_param // structure contains all the information we need to // construct the dirac operator. @@ -166,10 +160,20 @@ std::vector eigensolve(test_t test_param) eig_param.compute_svd = ::testing::get<3>(test_param); eig_param.spectrum = ::testing::get<4>(test_param); - if (eig_param.use_pc) - eig_inv_param.solution_type = QUDA_MATPC_SOLUTION; - else - eig_inv_param.solution_type = QUDA_MAT_SOLUTION; + eig_inv_param.solution_type = eig_param.use_pc ? QUDA_MATPC_SOLUTION : QUDA_MAT_SOLUTION; + + // whether we are using the resident smeared gauge or not + eig_param.use_smeared_gauge = gauge_smear; + + if (dslash_type == QUDA_LAPLACE_DSLASH) { + int dimension = laplace3D < 4 ? 3 : 4; + eig_inv_param.kappa = 1.0 / (2 * dimension + mass); + eig_inv_param.laplace3D = laplace3D; + if (dimension == 3) { + eig_param.ortho_dim = laplace3D; + eig_param.ortho_dim_size_local = tdim; + } + } // For gtest testing, we prohibit the use of polynomial acceleration as // the fine tuning required can inhibit convergence of an otherwise @@ -192,24 +196,65 @@ std::vector eigensolve(test_t test_param) if (!enable_testing || (enable_testing && getVerbosity() >= QUDA_VERBOSE)) display_test_info(eig_param); + // Gauge Smearing Routines + if (gauge_smear) { + quda::host_timer_t host_timer; + host_timer.start(); // start the timer + + std::vector<QudaGaugeObservableParam> obs_param(gauge_smear_steps / measurement_interval + 1); + for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { + obs_param[i] = newQudaGaugeObservableParam(); + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; + obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE; + obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + } + + // Here we set all the problem parameters for all supported smearing types. + QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam(); + setGaugeSmearParam(smear_param); + + switch (smear_param.smear_type) { + case QUDA_GAUGE_SMEAR_APE: + case QUDA_GAUGE_SMEAR_STOUT: + case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: + case QUDA_GAUGE_SMEAR_HYP: { + performGaugeSmearQuda(&smear_param, obs_param.data()); + break; + } + + // Here we have a typical use case which differs from simple smearing in that + // the user will want to compute the plaquette values in order to compute the gauge energy. + case QUDA_GAUGE_SMEAR_WILSON_FLOW: + case QUDA_GAUGE_SMEAR_SYMANZIK_FLOW: { + for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; + } + performWFlowQuda(&smear_param, obs_param.data()); + break; + } + default: errorQuda("Undefined gauge smear type %d given", smear_param.smear_type); + } + + host_timer.stop(); + printfQuda("Time for gauge smearing = %f\n", host_timer.last()); + } + // Vector construct START //---------------------------------------------------------------------------- // Host side arrays to store the eigenpairs computed by QUDA int n_eig = eig_n_conv; if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) n_eig *= 2; - std::vector<quda::ColorSpinorField> evecs(n_eig); quda::ColorSpinorParam cs_param; constructStaggeredTestSpinorParam(&cs_param, &eig_inv_param, &gauge_param); // Void pointers to host side arrays, compatible with the QUDA interface. std::vector<void *> host_evecs_ptr(n_eig); // Allocate host side memory and pointers - for (int i = 0; i < n_eig; i++) { - evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].data(); - } + std::vector<quda::ColorSpinorField> evecs(n_eig, cs_param); + for (int i = 0; i < n_eig; i++) host_evecs_ptr[i] = evecs[i].data(); // Complex eigenvalues - std::vector<__complex__ double> evals(eig_n_conv); + int n_batch = laplace3D == 3 ? eig_param.ortho_dim_size_local * comm_dim(3) : 1; + std::vector<__complex__ double> evals(eig_n_conv * n_batch); // Vector construct END //---------------------------------------------------------------------------- @@ -226,19 +271,20 @@ printfQuda("Time for %s solution = %f\n", eig_param.arpack_check ? "ARPACK" : "QUDA", host_timer.last()); // Perform host side verification of eigenvector if requested. - // ... std::vector<double> residua(eig_n_conv, 0.0); // Perform host side verification of eigenvector if requested. if (verify_results) { for (int i = 0; i < eig_n_conv; i++) { if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) { - double _Complex sigma = evals[i]; + std::vector<double _Complex> sigma(n_batch); + for (auto b = 0; b < n_batch; b++) sigma[b] = evals[b * eig_n_conv + i]; residua[i] = verifyStaggeredTypeSingularVector(evecs[i], evecs[i + eig_n_conv], sigma, i, eig_param, cpuFatQDP, - cpuLongQDP); + cpuLongQDP, laplace3D); } else { - double _Complex lambda = evals[i]; - residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP); + std::vector<double _Complex> lambda(n_batch); + for (auto b = 0; b < n_batch; b++) lambda[b] = evals[b * eig_n_conv + i]; + residua[i] = verifyStaggeredTypeEigenvector(evecs[i], lambda, i, eig_param, cpuFatQDP, cpuLongQDP, laplace3D); } } } @@ -285,7 +331,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); + if (is_laplace(dslash_type)) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); }
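Note: the batched layout used above stores eigenvalue i of timeslice-batch b at evals[b * eig_n_conv + i]; with n_batch == 1 this reduces to the usual 4-d layout. The unpacking, condensed from the verification loop above (illustrative):

    // gather the per-timeslice eigenvalues belonging to eigenvector i
    int n_batch = laplace3D == 3 ? eig_param.ortho_dim_size_local * comm_dim(3) : 1;
    std::vector<double _Complex> lambda(n_batch);
    for (int b = 0; b < n_batch; b++) lambda[b] = evals[b * eig_n_conv + i];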
diff --git a/tests/staggered_eigensolve_test_gtest.hpp index c73c85378e..8918c3df4d 100644 --- a/tests/staggered_eigensolve_test_gtest.hpp +++ b/tests/staggered_eigensolve_test_gtest.hpp @@ -32,6 +32,9 @@ bool skip_test(test_t test_param) auto compute_svd = ::testing::get<3>(test_param); auto spectrum = ::testing::get<4>(test_param); + // 3-d operator only supported for Laplace + if (eig_type == QUDA_EIG_TR_LANCZOS_3D && dslash_type != QUDA_LAPLACE_DSLASH) return true; + // Reverse engineer the operator type QudaSolveType combo_solve_type = get_solve_type(use_norm_op, use_pc, compute_svd); if (combo_solve_type == QUDA_DIRECT_PC_SOLVE) { @@ -87,6 +90,7 @@ bool skip_test(test_t test_param) case QUDA_LAPLACE_DSLASH: switch (eig_type) { case QUDA_EIG_TR_LANCZOS: + case QUDA_EIG_TR_LANCZOS_3D: case QUDA_EIG_BLK_TR_LANCZOS: if (spectrum != QUDA_SPECTRUM_LR_EIG && spectrum != QUDA_SPECTRUM_SR_EIG) return true; break; @@ -117,6 +121,8 @@ TEST_P(StaggeredEigensolveTest, verify) auto eig_type = ::testing::get<0>(GetParam()); if (eig_type == QUDA_EIG_IR_ARNOLDI || eig_type == QUDA_EIG_BLK_IR_ARNOLDI) factor *= 10; auto tol = factor * eig_param.tol; + if (dslash_type == QUDA_LAPLACE_DSLASH) laplace3D = eig_type == QUDA_EIG_TR_LANCZOS_3D ? 3 : 4; + for (auto rsd : eigensolve(GetParam())) EXPECT_LE(rsd, tol); } @@ -143,6 +149,9 @@ auto hermitian_solvers = Values(QUDA_EIG_TR_LANCZOS, QUDA_EIG_BLK_TR_LANCZOS, QU // Can solve non-hermitian systems auto non_hermitian_solvers = Values(QUDA_EIG_IR_ARNOLDI); +// Batched solvers for 3-d operators +auto batched_solvers = Values(QUDA_EIG_TR_LANCZOS_3D); + // Eigensolver spectrum types auto hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG); auto non_hermitian_spectrum = Values(QUDA_SPECTRUM_LR_EIG, QUDA_SPECTRUM_SR_EIG, QUDA_SPECTRUM_LM_EIG, @@ -171,3 +180,9 @@ INSTANTIATE_TEST_SUITE_P(DirectFull, StaggeredEigensolveTest, ::testing::Combine(hermitian_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), non_hermitian_spectrum), gettestname); + +// 3-d full system direct solve +INSTANTIATE_TEST_SUITE_P(DirectFull3D, StaggeredEigensolveTest, + ::testing::Combine(batched_solvers, Values(QUDA_BOOLEAN_FALSE), Values(QUDA_BOOLEAN_FALSE), + Values(QUDA_BOOLEAN_FALSE), hermitian_spectrum), + gettestname);
diff --git a/tests/staggered_invert_test.cpp index 4177feb954..77d9cd5a75 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -450,9 +450,9 @@ std::vector> solve(test_t param) // Create an appropriate subset of the full out_multishift vector std::vector<quda::ColorSpinorField> out_subset = {out_multishift.begin() + n * multishift, out_multishift.begin() + (n + 1) * multishift}; - res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param); + res[n] = verifyStaggeredInversion(in[n], out_subset, cpuFatQDP, cpuLongQDP, inv_param, laplace3D); } else { - res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param, n); + res[n] = verifyStaggeredInversion(in[n], out[n], cpuFatQDP, cpuLongQDP, inv_param, laplace3D, n); } } } @@ -510,7 +510,8 @@ int main(int argc, char **argv) if (!is_staggered(dslash_type) && !is_laplace(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); } else { - if (is_laplace(dslash_type)) errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); + if (is_laplace(dslash_type)) + errorQuda("The Laplace dslash is not enabled, cmake configure with -DQUDA_DIRAC_LAPLACE=ON"); if (!is_staggered(dslash_type)) errorQuda("dslash_type %s not supported", get_dslash_str(dslash_type)); }
diff --git a/tests/su3_test.cpp index 265fed1248..5ddfc52956 100644 --- a/tests/su3_test.cpp +++ b/tests/su3_test.cpp @@ -18,19 +18,6 @@ #define MAX(a, b) ((a) > (b) ?
(a) : (b)) -// Smearing variables -double gauge_smear_rho = 0.1; -double gauge_smear_epsilon = 0.1; -double gauge_smear_alpha = 0.6; -double gauge_smear_alpha1 = 0.75; -double gauge_smear_alpha2 = 0.6; -double gauge_smear_alpha3 = 0.3; -int gauge_smear_steps = 50; -QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; -int gauge_smear_dir_ignore = -1; -int measurement_interval = 5; -bool su_project = true; - void display_test_info() { printfQuda("running the following test:\n"); @@ -68,49 +55,6 @@ void display_test_info() return; } -void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app) -{ - CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE}, - {"stout", QUDA_GAUGE_SMEAR_STOUT}, - {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}, - {"hyp", QUDA_GAUGE_SMEAR_HYP}, - {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW}, - {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}}; - - // Option group for SU(3) related options - auto opgroup = quda_app->add_option_group("SU(3)", "Options controlling SU(3) tests"); - - opgroup - ->add_option( - "--su3-smear-type", - gauge_smear_type, "The type of action to use in the smearing. Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)") - ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map)); - ; - opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)"); - opgroup->add_option("--su3-smear-rho", gauge_smear_rho, - "rho coefficient for Stout and Over-Improved Stout smearing (default 0.1)"); - opgroup->add_option("--su3-smear-epsilon", gauge_smear_epsilon, - "epsilon coefficient for Over-Improved Stout smearing or Wilson flow (default 0.1)"); - opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)"); - opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)"); - opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)"); - opgroup->add_option( - "--su3-smear-dir-ignore", gauge_smear_dir_ignore, - "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)"); - opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 50)"); - opgroup->add_option("--su3-measurement-interval", measurement_interval, - "Measure the field energy and/or topological charge every Nth step (default 5) "); - opgroup->add_option("--su3-project", su_project, - "Project smeared gauge onto su3 manifold at measurement interval (default true)"); -} int main(int argc, char **argv) { @@ -257,7 +201,7 @@ QudaGaugeObservableParam *obs_param = new QudaGaugeObservableParam[gauge_smear_steps / measurement_interval + 1]; for (int i = 0; i < gauge_smear_steps / measurement_interval + 1; i++) { obs_param[i] = newQudaGaugeObservableParam(); - obs_param[i].compute_plaquette = QUDA_BOOLEAN_FALSE; + obs_param[i].compute_plaquette = QUDA_BOOLEAN_TRUE; obs_param[i].compute_qcharge = QUDA_BOOLEAN_TRUE; obs_param[i].su_project = su_project ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; }
diff --git a/tests/utils/command_line_params.cpp index 43d2745035..6cfe9cf0ad 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -73,7 +73,6 @@ quda::mgarray mg_vec_partfile = {}; QudaInverterType inv_type; bool inv_deflate = false; bool inv_multigrid = false; -bool gauge_smear = false; QudaInverterType precon_type = QUDA_INVALID_INVERTER; QudaSchwarzType precon_schwarz_type = QUDA_INVALID_SCHWARZ; QudaAcceleratorType precon_accelerator_type = QUDA_INVALID_ACCELERATOR; @@ -302,13 +301,18 @@ double eofa_mq2 = 0.85; double eofa_mq3 = 1.0; // SU(3) smearing options +bool gauge_smear = false; +QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; double gauge_smear_rho = 0.1; -double gauge_smear_epsilon = 1.0; +double gauge_smear_epsilon = 0.1; double gauge_smear_alpha = 0.6; +double gauge_smear_alpha1 = 0.75; +double gauge_smear_alpha2 = 0.6; +double gauge_smear_alpha3 = 0.3; int gauge_smear_steps = 5; -QudaWFlowType wflow_type = QUDA_WFLOW_TYPE_WILSON; +int gauge_smear_dir_ignore = -1; int measurement_interval = 5; -QudaGaugeSmearType gauge_smear_type = QUDA_GAUGE_SMEAR_STOUT; +bool su_project = true; // contract options QudaContractType contract_type = QUDA_CONTRACT_TYPE_STAGGERED_FT_T; @@ -410,6 +414,7 @@ namespace CLI::TransformPairs<QudaEigType> eig_type_map {{"trlm", QUDA_EIG_TR_LANCZOS}, {"blktrlm", QUDA_EIG_BLK_TR_LANCZOS}, + {"trlm-3d", QUDA_EIG_TR_LANCZOS_3D}, {"iram", QUDA_EIG_IR_ARNOLDI}, {"blkiram", QUDA_EIG_BLK_IR_ARNOLDI}}; @@ -453,11 +458,12 @@ namespace {"SR", QUDA_SPECTRUM_SR_EIG}, {"LR", QUDA_SPECTRUM_LR_EIG}, {"SM", QUDA_SPECTRUM_SM_EIG}, {"LM", QUDA_SPECTRUM_LM_EIG}, {"SI", QUDA_SPECTRUM_SI_EIG}, {"LI", QUDA_SPECTRUM_LI_EIG}}; - CLI::TransformPairs<QudaWFlowType> wflow_type_map {{"wilson", QUDA_WFLOW_TYPE_WILSON}, - {"symanzik", QUDA_WFLOW_TYPE_SYMANZIK}}; - - CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map { - {"ape", QUDA_GAUGE_SMEAR_APE}, {"stout", QUDA_GAUGE_SMEAR_STOUT}, {"ovr-imp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}}; + CLI::TransformPairs<QudaGaugeSmearType> gauge_smear_type_map {{"ape", QUDA_GAUGE_SMEAR_APE}, + {"stout", QUDA_GAUGE_SMEAR_STOUT}, + {"ovrimp-stout", QUDA_GAUGE_SMEAR_OVRIMP_STOUT}, + {"hyp", QUDA_GAUGE_SMEAR_HYP}, + {"wilson", QUDA_GAUGE_SMEAR_WILSON_FLOW}, + {"symanzik", QUDA_GAUGE_SMEAR_SYMANZIK_FLOW}}; CLI::TransformPairs<QudaSetupType> setup_type_map {{"test", QUDA_TEST_VECTOR_SETUP}, {"null", QUDA_TEST_VECTOR_SETUP}}; @@ -765,7 +771,6 @@ std::shared_ptr make_app(std::string app_description, std::string app_n void add_eigen_option_group(std::shared_ptr<QUDAApp> quda_app) { - CLI::QUDACheckedTransformer prec_transform(precision_map); // Option group for Eigensolver related options auto opgroup = quda_app->add_option_group("Eigensolver", "Options controlling eigensolver"); @@ -1118,9 +1123,16 @@ void add_eofa_option_group(std::shared_ptr<QUDAApp> quda_app) void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app) { - // Option group for SU(3) related options auto opgroup = quda_app->add_option_group("SU(3)", "Options controlling SU(3) tests"); + + opgroup + ->add_option( + "--su3-smear-type", + gauge_smear_type, "The type of action to use in the smearing.
Options: APE, Stout, Over Improved Stout, HYP, Wilson Flow, Symanzik Flow (default stout)") + ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map)); + ; + opgroup->add_option("--su3-smear-alpha", gauge_smear_alpha, "alpha coefficient for APE smearing (default 0.6)"); opgroup->add_option("--su3-smear-rho", gauge_smear_rho, @@ -1128,18 +1140,23 @@ void add_su3_option_group(std::shared_ptr quda_app) opgroup->add_option( "--su3-smear-epsilon", gauge_smear_epsilon, - "epsilon coefficient for Over-Improved Stout smearing and step size for Wilson flow (default 1.0)"); + "epsilon coefficient for Over-Improved Stout smearing and step size for Wilson flow (default 0.1)"); - opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 10)"); + opgroup->add_option("--su3-smear-alpha1", gauge_smear_alpha1, "alpha1 coefficient for HYP smearing (default 0.75)"); + opgroup->add_option("--su3-smear-alpha2", gauge_smear_alpha2, "alpha2 coefficient for HYP smearing (default 0.6)"); + opgroup->add_option("--su3-smear-alpha3", gauge_smear_alpha3, "alpha3 coefficient for HYP smearing (default 0.3)"); - opgroup->add_option("--su3-wflow-type", wflow_type, "The type of action to use in the wilson flow (default wilson)") - ->transform(CLI::QUDACheckedTransformer(wflow_type_map)); + opgroup->add_option( + "--su3-smear-dir-ignore", gauge_smear_dir_ignore, + "Direction to be ignored by the smearing, negative value means decided by --su3-smear-type (default -1)"); - opgroup->add_option("--su3-smear-type", gauge_smear_type, "The type of smearing to use (default stout)") - ->transform(CLI::QUDACheckedTransformer(gauge_smear_type_map)); + opgroup->add_option("--su3-smear-steps", gauge_smear_steps, "The number of smearing steps to perform (default 5)"); opgroup->add_option("--su3-measurement-interval", measurement_interval, - "Measure the field energy and topological charge every Nth step (default 5) "); + "Measure the field energy and/or topological charge every Nth step (default 5) "); + + opgroup->add_option("--su3-project", su_project, + "Project smeared gauge onto su3 manifold at measurement interval (default true)"); } void add_madwf_option_group(std::shared_ptr<QUDAApp> quda_app)
diff --git a/tests/utils/command_line_params.h index 338dcc73bc..38e81ebdc0 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -554,10 +554,14 @@ extern double eofa_mq3; extern double gauge_smear_rho; extern double gauge_smear_epsilon; extern double gauge_smear_alpha; +extern double gauge_smear_alpha1; +extern double gauge_smear_alpha2; +extern double gauge_smear_alpha3; extern int gauge_smear_steps; -extern QudaWFlowType wflow_type; +extern int gauge_smear_dir_ignore; extern int measurement_interval; extern QudaGaugeSmearType gauge_smear_type; +extern bool su_project; extern double smear_coeff; extern int smear_n_steps;
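Note: the smearing flags consolidated above all funnel into setGaugeSmearParam() (see set_params.cpp below), so any test binary linking command_line_params can drive any smear type. A hypothetical invocation exercising the new HYP coefficients (the flag values are only examples):

    // e.g.: ./su3_test --su3-smear-type hyp --su3-smear-alpha1 0.75 \
    //                  --su3-smear-alpha2 0.6 --su3-smear-alpha3 0.3 \
    //                  --su3-smear-steps 5 --su3-measurement-interval 5
    QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam();
    setGaugeSmearParam(smear_param); // copies smear_type, rho, epsilon, alpha, alpha1-3, dir_ignore, n_steps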
len, QudaPrecision precision) +double norm_2(void *v, int len, QudaPrecision precision, bool global) { if (precision == QUDA_DOUBLE_PRECISION) - return norm2((double *)v, len); + return norm2((double *)v, len, global); else - return norm2((float *)v, len); + return norm2((float *)v, len, global); } // performs the operation y[i] = x[i] + a*y[i] diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index c63d124dc8..473a527ac5 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -250,7 +250,7 @@ void exchange_llfat_init(QudaPrecision prec); void exchange_llfat_cleanup(void); // Implemented in host_blas.cpp -double norm_2(void *vector, int len, QudaPrecision precision); +double norm_2(void *vector, int len, QudaPrecision precision, bool global = true); void mxpy(void *x, void *y, int len, QudaPrecision precision); void ax(double a, void *x, int len, QudaPrecision precision); void cax(double _Complex a, void *x, int len, QudaPrecision precision); diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp index 8ec29c514b..58cc3fb758 100644 --- a/tests/utils/misc.cpp +++ b/tests/utils/misc.cpp @@ -221,6 +221,7 @@ const char *get_eig_type_str(QudaEigType type) switch (type) { case QUDA_EIG_TR_LANCZOS: ret = "trlm"; break; case QUDA_EIG_BLK_TR_LANCZOS: ret = "blktrlm"; break; + case QUDA_EIG_TR_LANCZOS_3D: ret = "trlm_3d"; break; case QUDA_EIG_IR_ARNOLDI: ret = "iram"; break; case QUDA_EIG_BLK_IR_ARNOLDI: ret = "blkiram"; break; default: ret = "unknown eigensolver"; break; diff --git a/tests/utils/set_params.cpp b/tests/utils/set_params.cpp index 7dd43a651a..98bf9da590 100644 --- a/tests/utils/set_params.cpp +++ b/tests/utils/set_params.cpp @@ -5,12 +5,16 @@ void setGaugeSmearParam(QudaGaugeSmearParam &smear_param) { + smear_param.smear_type = gauge_smear_type; smear_param.alpha = gauge_smear_alpha; smear_param.rho = gauge_smear_rho; smear_param.epsilon = gauge_smear_epsilon; smear_param.n_steps = gauge_smear_steps; smear_param.meas_interval = measurement_interval; - smear_param.smear_type = gauge_smear_type; + smear_param.alpha1 = gauge_smear_alpha1; + smear_param.alpha2 = gauge_smear_alpha2; + smear_param.alpha3 = gauge_smear_alpha3; + smear_param.dir_ignore = gauge_smear_dir_ignore; smear_param.struct_size = sizeof(smear_param); } @@ -125,16 +129,17 @@ void setInvertParam(QudaInvertParam &inv_param) { // Set dslash type inv_param.dslash_type = dslash_type; + int dimension = laplace3D < 4 ? 3 : 4; // Use kappa or mass normalisation if (kappa == -1.0) { inv_param.mass = mass; inv_param.kappa = 1.0 / (2.0 * (1 + 3 / anisotropy + mass)); - if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (8 + mass); + if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.kappa = 1.0 / (2 * dimension + mass); } else { inv_param.kappa = kappa; inv_param.mass = 0.5 / kappa - (1.0 + 3.0 / anisotropy); - if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 8.0; + if (dslash_type == QUDA_LAPLACE_DSLASH) inv_param.mass = 1.0 / kappa - 2 * dimension; } if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Kappa = %.8f Mass = %.8f\n", inv_param.kappa, inv_param.mass); @@ -935,7 +940,8 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param) // Solver params inv_param.verbosity = verbosity; inv_param.mass = mass; - inv_param.kappa = kappa = 1.0 / (8.0 + mass); // for Laplace operator + int dimension = laplace3D < 4 ? 
3 : 4; + inv_param.kappa = 1.0 / (2 * dimension + mass); // for Laplace operator inv_param.laplace3D = laplace3D; // for Laplace operator if (Nsrc < Nsrc_tile || Nsrc % Nsrc_tile != 0) @@ -1002,7 +1008,7 @@ void setStaggeredInvertParam(QudaInvertParam &inv_param) inv_param.solve_type = solve_type; inv_param.matpc_type = matpc_type; inv_param.dagger = QUDA_DAG_NO; - inv_param.mass_normalization = QUDA_MASS_NORMALIZATION; + inv_param.mass_normalization = dslash_type == QUDA_LAPLACE_DSLASH ? QUDA_KAPPA_NORMALIZATION : QUDA_MASS_NORMALIZATION; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = prec;
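Note: the Laplace normalization used throughout this patch is a simple round trip: with d active dimensions (d = 3 when laplace3D < 4, otherwise 4), kappa = 1 / (2 d + mass), and hence mass = 1 / kappa - 2 d when kappa is specified directly. A worked check (values illustrative):

    int dimension = laplace3D < 4 ? 3 : 4;           // e.g. laplace3D = 3 gives d = 3
    double kappa = 1.0 / (2 * dimension + mass);     // mass = 1.0, d = 3 -> kappa = 1/7
    double mass_check = 1.0 / kappa - 2 * dimension; // recovers mass = 1.0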