From aeba7d72c809e06eae446c5da53703736b694ab5 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 29 Jan 2024 12:10:09 -0800 Subject: [PATCH 01/21] Prolongator now multi-RHS aware --- include/kernels/prolongator.cuh | 62 ++++++++++++++++++++---------- include/transfer.h | 8 ++-- lib/prolongator.in.cpp | 14 +++---- lib/prolongator.in.cu | 61 ++++++++++++++++------------- lib/staggered_prolong_restrict.cu | 8 ++++ lib/transfer.cpp | 64 ++++++++++++++++++++----------- 6 files changed, 137 insertions(+), 80 deletions(-) diff --git a/include/kernels/prolongator.cuh b/include/kernels/prolongator.cuh index 1075b5e8d5..d4296d41b1 100644 --- a/include/kernels/prolongator.cuh +++ b/include/kernels/prolongator.cuh @@ -22,37 +22,55 @@ namespace quda { static constexpr int fineColor = fineColor_; static constexpr int coarseColor = coarseColor_; - FieldOrderCB(fineSpin)> out;; - const FieldOrderCB(coarseSpin)> in; - const FieldOrderCB(fineSpin), vFloat> V; + // disable ghost to reduce arg size + using F = FieldOrderCB(fineSpin), Float, Float, true>; + using C = FieldOrderCB(coarseSpin), Float, Float, true>; + using V = FieldOrderCB(fineSpin), vFloat, vFloat>; + + static constexpr unsigned int max_n_src = 64; + const int_fastdiv n_src; + F out[max_n_src]; + C in[max_n_src]; + const V v; const int *geo_map; // need to make a device copy of this const spin_mapper spin_map; const int parity; // the parity of the output field (if single parity) const int nParity; // number of parities of input fine field - ProlongateArg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V, + ProlongateArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *geo_map, const int parity) : - kernel_param(dim3(out.VolumeCB(), out.SiteSubset(), fineColor/fine_colors_per_thread())), - out(out), in(in), V(V), geo_map(geo_map), spin_map(), parity(parity), nParity(out.SiteSubset()) - { } + kernel_param(dim3(out[0].VolumeCB(), out[0].SiteSubset() * out.size(), fineColor/fine_colors_per_thread())), + n_src(out.size()), + v(v), + geo_map(geo_map), + spin_map(), + parity(parity), + nParity(out[0].SiteSubset()) + { + if (out.size() > max_n_src) errorQuda("vector set size %lu greater than max size %d", out.size(), max_n_src); + for (auto i = 0u; i < out.size(); i++) { + this->out[i] = out[i]; + this->in[i] = in[i]; + } + } }; /** Applies the grid prolongation operator (coarse to fine) */ template - __device__ __host__ inline void prolongate(complex out[], const Arg &arg, int parity, int x_cb) + __device__ __host__ inline void prolongate(complex out[], const Arg &arg, int src_idx, int parity, int x_cb) { - int x = parity*arg.out.VolumeCB() + x_cb; + int x = parity * arg.out[src_idx].VolumeCB() + x_cb; int x_coarse = arg.geo_map[x]; - int parity_coarse = (x_coarse >= arg.in.VolumeCB()) ? 1 : 0; - int x_coarse_cb = x_coarse - parity_coarse*arg.in.VolumeCB(); + int parity_coarse = (x_coarse >= arg.in[src_idx].VolumeCB()) ? 1 : 0; + int x_coarse_cb = x_coarse - parity_coarse * arg.in[src_idx].VolumeCB(); #pragma unroll for (int s=0; s - __device__ __host__ inline void rotateFineColor(const Arg &arg, const complex in[], int parity, int x_cb, int fine_color_block) + __device__ __host__ inline void rotateFineColor(const Arg &arg, const complex in[], int src_idx, int parity, int x_cb, int fine_color_block) { constexpr int fine_color_per_thread = fine_colors_per_thread(); const int spinor_parity = (arg.nParity == 2) ? parity : 0; - const int v_parity = (arg.V.Nparity() == 2) ? 
parity : 0; + const int v_parity = (arg.v.Nparity() == 2) ? parity : 0; constexpr int color_unroll = 2; @@ -82,15 +100,15 @@ namespace quda { #pragma unroll for (int j=0; j tmp[Arg::fineSpin*Arg::coarseColor]; - prolongate(tmp, arg, parity, x_cb); - rotateFineColor(arg, tmp, parity, x_cb, fine_color_block); + + prolongate(tmp, arg, src_idx, parity, x_cb); + rotateFineColor(arg, tmp, src_idx, parity, x_cb, fine_color_block); } }; diff --git a/include/transfer.h b/include/transfer.h index 87500f0fd8..01d3e6c50f 100644 --- a/include/transfer.h +++ b/include/transfer.h @@ -178,7 +178,7 @@ namespace quda { * @param out The resulting field on the fine lattice * @param in The input field on the coarse lattice */ - void P(ColorSpinorField &out, const ColorSpinorField &in) const; + void P(cvector_ref &out, cvector_ref &in) const; /** * Apply the restrictor @@ -313,11 +313,11 @@ namespace quda { @param[in] spin_map Spin blocking lookup table @param[in] parity of the output fine field (if single parity output field) */ - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); template - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); /** @@ -345,7 +345,7 @@ namespace quda { @param[in] spin_map Spin blocking lookup table @param[in] parity of the output fine field (if single parity output field) */ - void StaggeredProlongate(ColorSpinorField &out, const ColorSpinorField &in, const int *fine_to_coarse, + void StaggeredProlongate(cvector_ref &out, cvector_ref &in, const int *fine_to_coarse, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); /** diff --git a/lib/prolongator.in.cpp b/lib/prolongator.in.cpp index 008d73de26..0f05524289 100644 --- a/lib/prolongator.in.cpp +++ b/lib/prolongator.in.cpp @@ -7,10 +7,10 @@ namespace quda }; template - void Prolongate2(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate2(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *const *spin_map, int parity, IntList) { - if (in.Ncolor() == coarseColor) { + if (in[0].Ncolor() == coarseColor) { if constexpr (coarseColor >= fineColor) { Prolongate(out, in, v, fine_to_coarse, spin_map, parity); } else { @@ -20,16 +20,16 @@ namespace quda if constexpr (sizeof...(N) > 0) { Prolongate2(out, in, v, fine_to_coarse, spin_map, parity, IntList()); } else { - errorQuda("Coarse Nc = %d has not been instantiated", in.Ncolor()); + errorQuda("Coarse Nc = %d has not been instantiated", in[0].Ncolor()); } } } template - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *const *spin_map, int parity, IntList) { - if (out.Ncolor() == fineColor) { + if (out[0].Ncolor() == fineColor) { // clang-format off IntList<@QUDA_MULTIGRID_NVEC_LIST@> coarseColors; // clang-format on @@ -38,12 +38,12 @@ namespace quda if constexpr (sizeof...(N) > 0) { Prolongate(out, in, v, fine_to_coarse, spin_map, parity, IntList()); } else { - errorQuda("Fine Nc = %d has not been instantiated", 
out.Ncolor());
+        errorQuda("Fine Nc = %d has not been instantiated", out[0].Ncolor());
       }
     }
   }

-  void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
+  void Prolongate(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
                   const int *fine_to_coarse, const int *const *spin_map, int parity)
   {
     if constexpr (is_enabled_multigrid()) {
diff --git a/lib/prolongator.in.cu b/lib/prolongator.in.cu
index 43f6b2a018..8320ca5e8f 100644
--- a/lib/prolongator.in.cu
+++ b/lib/prolongator.in.cu
@@ -9,50 +9,59 @@ namespace quda {
  class ProlongateLaunch : public TunableKernel3D {
    using Arg = ProlongateArg;
-    ColorSpinorField &out;
-    const ColorSpinorField &in;
+    cvector_ref<ColorSpinorField> &out;
+    cvector_ref<const ColorSpinorField> &in;
    const ColorSpinorField &V;
    const int *fine_to_coarse;
    int parity;
    QudaFieldLocation location;

-    unsigned int minThreads() const { return out.VolumeCB(); } // fine parity is the block y dimension
+    unsigned int minThreads() const { return out[0].VolumeCB(); } // fine parity is the block y dimension

  public:
-    ProlongateLaunch(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V,
+    ProlongateLaunch(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &V,
                     const int *fine_to_coarse, int parity)
-      : TunableKernel3D(in, out.SiteSubset(), fineColor/fine_colors_per_thread()), out(out), in(in), V(V),
-        fine_to_coarse(fine_to_coarse), parity(parity), location(checkLocation(out, in, V))
+      : TunableKernel3D(in[0], out[0].SiteSubset() * out.size(), fineColor/fine_colors_per_thread()),
+        out(out),
+        in(in),
+        V(V),
+        fine_to_coarse(fine_to_coarse),
+        parity(parity),
+        location(checkLocation(out[0], in[0], V))
    {
      strcat(vol, ",");
-      strcat(vol, out.VolString().c_str());
+      strcat(vol, out[0].VolString().c_str());
      strcat(aux, ",");
-      strcat(aux, out.AuxString().c_str());
+      strcat(aux, out[0].AuxString().c_str());

      apply(device::get_default_stream());
    }

-    void apply(const qudaStream_t &stream) {
-      if (checkNative(out, in, V)) {
+    void apply(const qudaStream_t &stream)
+    {
+      if (checkNative(out[0], in[0], V)) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity));
      }
    }

-    long long flops() const { return 8 * fineSpin * fineColor * coarseColor * out.SiteSubset()*(long long)out.VolumeCB(); }
+    long long flops() const
+    {
+      return out.size() * 8 * fineSpin * fineColor * coarseColor * out[0].SiteSubset() * out[0].VolumeCB();
+    }

    long long bytes() const {
-      size_t v_bytes = V.Bytes() / (V.SiteSubset() == out.SiteSubset() ? 1 : 2);
-      return in.Bytes() + out.Bytes() + v_bytes + out.SiteSubset()*out.VolumeCB()*sizeof(int);
+      size_t v_bytes = V.Bytes() / (V.SiteSubset() == out[0].SiteSubset() ?
1 : 2); + return out.size() * (in[0].Bytes() + out[0].Bytes() + v_bytes + out[0].SiteSubset() * out[0].VolumeCB() * sizeof(int)); } }; template - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int * const * spin_map, int parity) { - if (in.Nspin() != 2) errorQuda("Coarse spin %d is not supported", in.Nspin()); + if (in[0].Nspin() != 2) errorQuda("Coarse spin %d is not supported", in[0].Nspin()); constexpr int coarseSpin = 2; // first check that the spin_map matches the spin_mapper @@ -69,7 +78,7 @@ namespace quda { } else { errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION); } - } else if (v.Precision() == in.Precision()) { + } else if (v.Precision() == in[0].Precision()) { ProlongateLaunch prolongator(out, in, v, fine_to_coarse, parity); } else { @@ -78,25 +87,25 @@ namespace quda { } template - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int * const * spin_map, int parity) { - if (!is_enabled_spin(out.Nspin())) errorQuda("nSpin %d has not been built", in.Nspin()); + if (!is_enabled_spin(out[0].Nspin())) errorQuda("nSpin %d has not been built", in[0].Nspin()); - if (out.Nspin() == 2) { + if (out[0].Nspin() == 2) { Prolongate(out, in, v, fine_to_coarse, spin_map, parity); } else if constexpr (fineColor == 3) { - if (out.Nspin() == 4) { + if (out[0].Nspin() == 4) { if constexpr (is_enabled_spin(4)) Prolongate(out, in, v, fine_to_coarse, spin_map, parity); - } else if (out.Nspin() == 1) { + } else if (out[0].Nspin() == 1) { if constexpr (is_enabled_spin(1)) Prolongate(out, in, v, fine_to_coarse, spin_map, parity); } else { - errorQuda("Unsupported nSpin %d", out.Nspin()); + errorQuda("Unsupported nSpin %d", out[0].Nspin()); } } else { - errorQuda("Unexpected spin %d and color %d combination", out.Nspin(), out.Ncolor()); + errorQuda("Unexpected spin %d and color %d combination", out[0].Nspin(), out[0].Ncolor()); } } @@ -104,11 +113,11 @@ namespace quda { constexpr int coarseColor = @QUDA_MULTIGRID_NVEC2@; template <> - void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Prolongate(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int * const * spin_map, int parity) { if constexpr (is_enabled_multigrid()) { - QudaPrecision precision = checkPrecision(out, in); + QudaPrecision precision = checkPrecision(out[0], in[0]); if (precision == QUDA_DOUBLE_PRECISION) { if constexpr (is_enabled_multigrid_double()) @@ -118,7 +127,7 @@ namespace quda { } else if (precision == QUDA_SINGLE_PRECISION) { Prolongate(out, in, v, fine_to_coarse, spin_map, parity); } else { - errorQuda("Unsupported precision %d", out.Precision()); + errorQuda("Unsupported precision %d", out[0].Precision()); } } else { errorQuda("Multigrid has not been built"); diff --git a/lib/staggered_prolong_restrict.cu b/lib/staggered_prolong_restrict.cu index 39aa07365c..a8ae5ad604 100644 --- a/lib/staggered_prolong_restrict.cu +++ b/lib/staggered_prolong_restrict.cu @@ -122,4 +122,12 @@ namespace quda { void StaggeredRestrict(ColorSpinorField &, const ColorSpinorField &, const int *, const int * const *, int) { errorQuda("Staggered multigrid has not been build"); } #endif + void 
StaggeredProlongate(cvector_ref &out, cvector_ref &in, + const int *fine_to_coarse, const int * const * spin_map, int parity) + { + for (auto i = 0u; i < out.size(); i++) { + StaggeredProlongateRestrict(out[i], in[i], fine_to_coarse, spin_map, parity); + } + } + } // end namespace quda diff --git a/lib/transfer.cpp b/lib/transfer.cpp index 12821c4618..dab93f2304 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -356,60 +356,80 @@ namespace quda { } // apply the prolongator - void Transfer::P(ColorSpinorField &out, const ColorSpinorField &in) const { + void Transfer::P(cvector_ref &out, cvector_ref &in) const { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - ColorSpinorField *input = const_cast(&in); - ColorSpinorField *output = &out; initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; if (transfer_type == QUDA_TRANSFER_COARSE_KD) { - StaggeredProlongate(*output, *input, fine_to_coarse, spin_map, parity); + StaggeredProlongate(out, in, fine_to_coarse, spin_map, parity); } else if (transfer_type == QUDA_TRANSFER_OPTIMIZED_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) { - if (in.SiteSubset() != QUDA_FULL_SITE_SUBSET) errorQuda("Optimized KD op only supports full-parity spinors"); - - if (output->VolumeCB() != input->VolumeCB()) errorQuda("Optimized KD transfer is only between equal volumes"); + if (in[0].SiteSubset() != QUDA_FULL_SITE_SUBSET) errorQuda("Optimized KD op only supports full-parity spinors"); + if (out[0].VolumeCB() != in[0].VolumeCB()) errorQuda("Optimized KD transfer is only between equal volumes"); // the optimized KD op acts on fine spinors - if (out.SiteSubset() == QUDA_PARITY_SITE_SUBSET) { - *output = input->Even(); + if (out[0].SiteSubset() == QUDA_PARITY_SITE_SUBSET) { + for (auto i = 0u; i < out.size(); i++) out[i] = in[i].Even(); } else { - *output = *input; + for (auto i = 0u; i < out.size(); i++) out[i] = in[i]; } } else if (transfer_type == QUDA_TRANSFER_AGGREGATE) { + std::vector input(in.size()); + std::vector output(out.size()); const ColorSpinorField *V = use_gpu ? V_d : V_h; + // the below are borked for m-rhs since we have only a single tmp available if (use_gpu) { - if (in.Location() == QUDA_CPU_FIELD_LOCATION) input = coarse_tmp_d; - if (out.Location() == QUDA_CPU_FIELD_LOCATION || out.GammaBasis() != V->GammaBasis()) - output = (out.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even(); + + // set input fields + if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { + for (auto i = 0u; i < in.size(); i++) input[i] = coarse_tmp_d->create_alias(); + } else { + for (auto i = 0u; i < in.size(); i++) input[i] = const_cast(in[i]).create_alias(); + } + + // set output fields + if (out[0].Location() == QUDA_CPU_FIELD_LOCATION || out[0].GammaBasis() != V->GammaBasis()) { + for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d->create_alias() : fine_tmp_d->Even().create_alias(); + } else { + for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); + } if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); } else { - if (out.Location() == QUDA_CUDA_FIELD_LOCATION) - output = (out.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? 
fine_tmp_h : &fine_tmp_h->Even();
+
+      // set input fields
+      for (auto i = 0u; i < in.size(); i++) input[i] = const_cast<ColorSpinorField &>(in[i]).create_alias();
+
+      // set output fields
+      if (out[0].Location() == QUDA_CUDA_FIELD_LOCATION) {
+        for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h->create_alias() : fine_tmp_h->Even().create_alias();
+      } else {
+        for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias();
+      }
+
    }

-    *input = in; // copy result to input field (aliasing handled automatically)
+    for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically) FIXME - maybe not?

-    if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out.SiteSubset() == QUDA_FULL_SITE_SUBSET)
+    if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET)
      errorQuda("Cannot prolongate to a full field since only have single parity null-space components");

-    if ((V->Nspin() != 1) && ((output->GammaBasis() != V->GammaBasis()) || (input->GammaBasis() != V->GammaBasis()))) {
+    if ((V->Nspin() != 1) && ((output[0].GammaBasis() != V->GammaBasis()) || (input[0].GammaBasis() != V->GammaBasis()))) {
      errorQuda("Cannot apply prolongator using fields in a different basis from the null space (%d,%d) != %d",
-                output->GammaBasis(), in.GammaBasis(), V->GammaBasis());
+                output[0].GammaBasis(), in[0].GammaBasis(), V->GammaBasis());
    }

-    Prolongate(*output, *input, *V, fine_to_coarse, spin_map, parity);
+    Prolongate(output, input, *V, fine_to_coarse, spin_map, parity);
+
+    for (auto i = 0u; i < out.size(); i++) out[i] = output[i]; // copy result to out field (aliasing handled automatically)

  } else {
    errorQuda("Invalid transfer type in prolongate");
  }

-  out = *output; // copy result to out field (aliasing handled automatically)
-
  getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
}

From 3fa6e95d431722eda0bc8830dfce6bfa5995367d Mon Sep 17 00:00:00 2001
From: maddyscientist 
Date: Mon, 29 Jan 2024 16:51:57 -0800
Subject: [PATCH 02/21] Create new functions ColorSpinorField::create_coarse
 and ColorSpinorField::create_fine, these will eventually replace
 ColorSpinorField::CreateCoarse and ColorSpinorField::CreateFine

---
 include/color_spinor_field.h | 28 ++++++++++++++
 lib/color_spinor_field.cpp   | 72 ++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index f6fc42d194..923678dbaa 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -819,6 +819,34 @@ namespace quda
      QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION, QudaMemoryType mem_type = QUDA_MEMORY_INVALID);

+    /**
+       @brief Create a coarse color-spinor field, using this field to set the meta data
+       @param[in] geoBlockSize Geometric block size that defines the coarse grid dimensions
+       @param[in] spinBlockSize Spin block size that defines the coarse spin dimension
+       @param[in] Nvec Number of coarse color degrees of freedom per grid point
+       @param[in] precision Optionally set the precision of the coarse field
+       @param[in] location Optionally set the location of the coarse field
+       @param[in] mem_type Optionally set the memory type used (e.g., can override with mapped memory)
+    */
+    ColorSpinorField create_coarse(const int *geoBlockSize, int spinBlockSize, int Nvec,
+                                   QudaPrecision precision = QUDA_INVALID_PRECISION,
+                                   QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION,
+                                   QudaMemoryType mem_type = QUDA_MEMORY_INVALID);
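(A minimal usage sketch of the new pair, illustrative only and not part of the patch; the names fine and geo_bs are hypothetical. For a Wilson-like fine field with nSpin = 4 and nColor = 3, blocking 4^4 sites with spinBlockSize = 2 and Nvec = 24 divides each x[d] by 4 and yields nSpin = 4 / 2 = 2 with nColor = 24, while create_fine inverts the mapping:

    int geo_bs[4] = {4, 4, 4, 4};
    ColorSpinorField coarse = fine.create_coarse(geo_bs, 2, 24); // x[d] /= 4, nSpin = 2, nColor = 24
    ColorSpinorField fine2 = coarse.create_fine(geo_bs, 2, 3);   // x[d] *= 4, nSpin = 2 * 2 = 4, nColor = 3
)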
+
+    /**
+       @brief Create a fine color-spinor field, using this field to set the meta data
+       @param[in] geoBlockSize Geometric block size that defines the fine grid dimensions
+       @param[in] spinBlockSize Spin block size that defines the fine spin dimension
+       @param[in] Nvec Number of fine color degrees of freedom per grid point
+       @param[in] precision Optionally set the precision of the fine field
+       @param[in] location Optionally set the location of the fine field
+       @param[in] mem_type Optionally set the memory type used (e.g., can override with mapped memory)
+    */
+    ColorSpinorField create_fine(const int *geoBlockSize, int spinBlockSize, int Nvec,
+                                 QudaPrecision precision = QUDA_INVALID_PRECISION,
+                                 QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION,
+                                 QudaMemoryType mem_type = QUDA_MEMORY_INVALID);
+
    /**
       @brief Backs up the ColorSpinorField
    */
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index d2a6c93d8d..e8fbfd56d5 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -923,6 +923,78 @@ namespace quda
    return new ColorSpinorField(fineParam);
  }

+  ColorSpinorField ColorSpinorField::create_coarse(const int *geoBlockSize, int spinBlockSize, int Nvec,
+                                                   QudaPrecision new_precision, QudaFieldLocation new_location,
+                                                   QudaMemoryType new_mem_type)
+  {
+    ColorSpinorParam coarseParam(*this);
+    for (int d = 0; d < nDim; d++) coarseParam.x[d] = x[d] / geoBlockSize[d];
+
+    int geoBlockVolume = 1;
+    for (int d = 0; d < nDim; d++) { geoBlockVolume *= geoBlockSize[d]; }
+
+    // Detect if the "coarse" op is the Kahler-Dirac op or something else
+    // that still acts on a fine staggered ColorSpinorField
+    if (geoBlockVolume == 1 && Nvec == nColor && nSpin == 1) {
+      coarseParam.nSpin = nSpin;
+      coarseParam.nColor = nColor;
+    } else {
+      coarseParam.nSpin = (nSpin == 1) ? 2 : (nSpin / spinBlockSize); // coarsening staggered check
+      coarseParam.nColor = Nvec;
+    }
+
+    coarseParam.siteSubset = QUDA_FULL_SITE_SUBSET; // coarse grid is always full
+    coarseParam.create = QUDA_ZERO_FIELD_CREATE;
+
+    // if new precision is not set, use this->precision
+    new_precision = (new_precision == QUDA_INVALID_PRECISION) ? Precision() : new_precision;
+
+    // if new location is not set, use this->location
+    new_location = (new_location == QUDA_INVALID_FIELD_LOCATION) ? Location() : new_location;
+
+    coarseParam.fieldOrder = (new_location == QUDA_CUDA_FIELD_LOCATION) ?
+      colorspinor::getNative(new_precision, coarseParam.nSpin) :
+      QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
+
+    coarseParam.setPrecision(new_precision);
+
+    // set where we allocate the field
+    coarseParam.mem_type = new_mem_type;
+
+    return ColorSpinorField(coarseParam);
+  }
+
+  ColorSpinorField ColorSpinorField::create_fine(const int *geoBlockSize, int spinBlockSize, int Nvec,
+                                                 QudaPrecision new_precision, QudaFieldLocation new_location,
+                                                 QudaMemoryType new_mem_type)
+  {
+    ColorSpinorParam fineParam(*this);
+    for (int d = 0; d < nDim; d++) fineParam.x[d] = x[d] * geoBlockSize[d];
+    fineParam.nSpin = nSpin * spinBlockSize;
+    fineParam.nColor = Nvec;
+    fineParam.siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME fine grid is always full
+    fineParam.create = QUDA_ZERO_FIELD_CREATE;
+
+    // if new precision is not set, use this->precision
+    new_precision = (new_precision == QUDA_INVALID_PRECISION) ? Precision() : new_precision;
+
+    // if new location is not set, use this->location
+    new_location = (new_location == QUDA_INVALID_FIELD_LOCATION) ?
Location() : new_location; + + // for GPU fields, always use native ordering to ensure coalescing + if (new_location == QUDA_CUDA_FIELD_LOCATION) { + fineParam.setPrecision(new_precision, new_precision, true); + } else { + fineParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + fineParam.setPrecision(new_precision); + } + + // set where we allocate the field + fineParam.mem_type = new_mem_type; + + return ColorSpinorField(fineParam); + } + // legacy CPU static ghost destructor void ColorSpinorField::freeGhostBuffer(void) { From 75646d3888a90c782e2a36fa83fcdbfcd53f8148 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 29 Jan 2024 16:54:58 -0800 Subject: [PATCH 03/21] Upgrade Transfer temporary allocation to support mrhs --- include/transfer.h | 32 ++++++++++++----------- lib/transfer.cpp | 64 +++++++++++++++++----------------------------- 2 files changed, 41 insertions(+), 55 deletions(-) diff --git a/include/transfer.h b/include/transfer.h index 01d3e6c50f..d8f9485d18 100644 --- a/include/transfer.h +++ b/include/transfer.h @@ -47,47 +47,47 @@ namespace quda { const QudaPrecision null_precision; /** CPU copy of the block-normalized null-space components that define the prolongator */ - mutable ColorSpinorField *V_h; + mutable ColorSpinorField *V_h = nullptr; /** GPU copy of the block-normalized null-space components that define the prolongator */ - mutable ColorSpinorField *V_d; + mutable ColorSpinorField *V_d = nullptr; /** A CPU temporary field with fine geometry and fine color we use for changing gamma basis */ - mutable ColorSpinorField *fine_tmp_h; + mutable std::vector fine_tmp_h; /** A GPU temporary field with fine geometry and fine color we use for changing gamma basis */ - mutable ColorSpinorField *fine_tmp_d; + mutable std::vector fine_tmp_d; /** A CPU temporary field with coarse geometry and coarse color */ - mutable ColorSpinorField *coarse_tmp_h; + mutable std::vector coarse_tmp_h; /** A GPU temporary field with coarse geometry and coarse color we use for CPU input / output */ - mutable ColorSpinorField *coarse_tmp_d; + mutable std::vector coarse_tmp_d; /** The geometrical coase grid blocking */ - int *geo_bs; + int *geo_bs = nullptr; /** The mapping onto coarse sites from fine sites. This has length equal to the fine-grid volume, and is sorted into lexicographical fine-grid order, with each value corresponding to a coarse-grid offset. (CPU) */ - mutable int *fine_to_coarse_h; + mutable int *fine_to_coarse_h = nullptr; /** The mapping onto fine sites from coarse sites. This has length equal to the fine-grid volume, and is sorted into lexicographical block order, with each value corresponding to a fine-grid offset. (CPU) */ - mutable int *coarse_to_fine_h; + mutable int *coarse_to_fine_h = nullptr; /** The mapping onto coarse sites from fine sites. This has length equal to the fine-grid volume, and is sorted into lexicographical fine-grid order, with each value corresponding to a coarse-grid offset. (GPU) */ - mutable int *fine_to_coarse_d; + mutable int *fine_to_coarse_d = nullptr; /** The mapping onto fine sites from coarse sites. This has length equal to the fine-grid volume, and is sorted into lexicographical block order, with each value corresponding to a fine-grid offset. (GPU) */ - mutable int *coarse_to_fine_d; + mutable int *coarse_to_fine_d = nullptr; /** The spin blocking. Defined as zero when the fine operator is staggered. 
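        A staggered fine operator has a single spin degree of freedom, so in that
        case there is no spin block to define.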
*/ int spin_bs; @@ -105,10 +105,10 @@ namespace quda { QudaParity parity; /** Whether the GPU transfer operator has been constructed */ - mutable bool enable_gpu; + mutable bool enable_gpu = false; /** Whether the CPU transfer operator has been constructed */ - mutable bool enable_cpu; + mutable bool enable_cpu = false; /** Whether to apply the transfer operaton the GPU (requires enable_gpu=true in the constructor) */ @@ -127,8 +127,9 @@ namespace quda { /** * @brief Allocate temporaries used when applying transfer operators * @param[in] location Where to allocate the temporaries + * @param[in] n_src Number of temporaries to allocate */ - void createTmp(QudaFieldLocation location) const; + void createTmp(QudaFieldLocation location, size_t n_src) const; /** * @brief Creates the map between fine and coarse grids @@ -145,8 +146,9 @@ namespace quda { /** * @brief Lazy allocation of the transfer operator in a given location * @param[in] location Where to allocate the temporaries + * @param[in] n_src Number of temporaries to allocate */ - void initializeLazy(QudaFieldLocation location) const; + void initializeLazy(QudaFieldLocation location, size_t n_src) const; public: /** diff --git a/lib/transfer.cpp b/lib/transfer.cpp index dab93f2304..3ae1a415bd 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -26,24 +26,11 @@ namespace quda { NblockOrtho(n_block_ortho), blockOrthoTwoPass(block_ortho_two_pass), null_precision(null_precision), - V_h(nullptr), - V_d(nullptr), - fine_tmp_h(nullptr), - fine_tmp_d(nullptr), - coarse_tmp_h(nullptr), - coarse_tmp_d(nullptr), - geo_bs(nullptr), - fine_to_coarse_h(nullptr), - coarse_to_fine_h(nullptr), - fine_to_coarse_d(nullptr), - coarse_to_fine_d(nullptr), spin_bs(spin_bs), spin_map(0), nspin_fine(B[0]->Nspin()), site_subset(QUDA_FULL_SITE_SUBSET), parity(QUDA_INVALID_PARITY), - enable_gpu(false), - enable_cpu(false), use_gpu(true), transfer_type(transfer_type) { @@ -109,7 +96,7 @@ namespace quda { } createV(B[0]->Location()); // allocate V field - createTmp(QUDA_CPU_FIELD_LOCATION); // allocate temporaries + createTmp(QUDA_CPU_FIELD_LOCATION, 1); // allocate temporaries (needed for geomap creation) // allocate and compute the fine-to-coarse and coarse-to-fine site maps fine_to_coarse_h = static_cast(pool_pinned_malloc(B[0]->Volume()*sizeof(int))); @@ -173,7 +160,7 @@ namespace quda { postTrace(); } - void Transfer::createTmp(QudaFieldLocation location) const + void Transfer::createTmp(QudaFieldLocation location, size_t n_src) const { // The CPU temporaries are needed for creating geometry mappings. 
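    // With the multi-RHS rework the temporaries are cached per (location, n_src)
    // pair: the early returns below skip reallocation when the cached vectors
    // already hold n_src fields, and the resize() calls grow them otherwise.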
if ((transfer_type == QUDA_TRANSFER_COARSE_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD @@ -182,6 +169,9 @@ namespace quda { return; } + if (location == QUDA_CUDA_FIELD_LOCATION && fine_tmp_d.size() == n_src && coarse_tmp_d.size() == n_src) return; + if (location == QUDA_CPU_FIELD_LOCATION && fine_tmp_h.size() == n_src && coarse_tmp_h.size() == n_src) return; + postTrace(); ColorSpinorParam param(*B[0]); param.create = QUDA_NULL_FIELD_CREATE; @@ -191,29 +181,30 @@ namespace quda { if (param.Precision() < QUDA_SINGLE_PRECISION) param.setPrecision(QUDA_SINGLE_PRECISION); if (location == QUDA_CUDA_FIELD_LOCATION) { - if (fine_tmp_d && coarse_tmp_d) return; - fine_tmp_d = new ColorSpinorField(param); - coarse_tmp_d = fine_tmp_d->CreateCoarse(geo_bs, spin_bs, Nvec); + resize(fine_tmp_d, n_src, param); + coarse_tmp_d.push_back(fine_tmp_d[0].create_coarse(geo_bs, spin_bs, Nvec)); + resize(coarse_tmp_d, n_src, ColorSpinorParam(coarse_tmp_d[0])); } else { - fine_tmp_h = new ColorSpinorField(param); - coarse_tmp_h = fine_tmp_h->CreateCoarse(geo_bs, spin_bs, Nvec); + resize(fine_tmp_h, n_src, param); + coarse_tmp_h.push_back(fine_tmp_h[0].create_coarse(geo_bs, spin_bs, Nvec)); + resize(coarse_tmp_h, n_src, ColorSpinorParam(coarse_tmp_h[0])); } postTrace(); } - void Transfer::initializeLazy(QudaFieldLocation location) const + void Transfer::initializeLazy(QudaFieldLocation location, size_t n_src) const { if (!enable_cpu && !enable_gpu) errorQuda("Neither CPU or GPU coarse fields initialized"); // delayed allocating this temporary until we need it - if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION); + //if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION, n_src); + createTmp(location, n_src); switch (location) { case QUDA_CUDA_FIELD_LOCATION: if (enable_gpu) return; createV(location); if (transfer_type == QUDA_TRANSFER_AGGREGATE) *V_d = *V_h; - createTmp(location); fine_to_coarse_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); coarse_to_fine_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0]->Volume() * sizeof(int), qudaMemcpyHostToDevice); @@ -270,12 +261,6 @@ namespace quda { if (V_h) delete V_h; if (V_d) delete V_d; - if (fine_tmp_h) delete fine_tmp_h; - if (fine_tmp_d) delete fine_tmp_d; - - if (coarse_tmp_h) delete coarse_tmp_h; - if (coarse_tmp_d) delete coarse_tmp_d; - if (geo_bs) delete []geo_bs; } @@ -304,8 +289,8 @@ namespace quda { int x[QUDA_MAX_DIM]; - ColorSpinorField &fine(*fine_tmp_h); - ColorSpinorField &coarse(*coarse_tmp_h); + ColorSpinorField &fine(fine_tmp_h[0]); + ColorSpinorField &coarse(coarse_tmp_h[0]); // compute the coarse grid point for every site (assuming parity ordering currently) for (size_t i = 0; i < fine.Volume(); i++) { @@ -359,7 +344,7 @@ namespace quda { void Transfer::P(cvector_ref &out, cvector_ref &in) const { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, in.size()); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; if (transfer_type == QUDA_TRANSFER_COARSE_KD) { @@ -382,19 +367,18 @@ namespace quda { const ColorSpinorField *V = use_gpu ? 
V_d : V_h; - // the below are borked for m-rhs since we have only a single tmp available if (use_gpu) { // set input fields if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < in.size(); i++) input[i] = coarse_tmp_d->create_alias(); + for (auto i = 0u; i < in.size(); i++) input[i] = coarse_tmp_d[i].create_alias(); } else { for (auto i = 0u; i < in.size(); i++) input[i] = const_cast(in[i]).create_alias(); } // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION || out[0].GammaBasis() != V->GammaBasis()) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d->create_alias() : fine_tmp_d->Even().create_alias(); + for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -406,7 +390,7 @@ namespace quda { // set output fields if (out[0].Location() == QUDA_CUDA_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h->create_alias() : fine_tmp_h->Even().create_alias(); + for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -440,7 +424,7 @@ namespace quda { ColorSpinorField *input = &const_cast(in); ColorSpinorField *output = &out; - initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, 1); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; const int *coarse_to_fine = use_gpu ? coarse_to_fine_d : coarse_to_fine_h; @@ -464,13 +448,13 @@ namespace quda { const ColorSpinorField *V = use_gpu ? V_d : V_h; if (use_gpu) { - if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = coarse_tmp_d; + if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = &coarse_tmp_d[0]; if (in.Location() == QUDA_CPU_FIELD_LOCATION || in.GammaBasis() != V->GammaBasis()) - input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even(); + input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? &fine_tmp_d[0] : &fine_tmp_d[0].Even(); if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); } else { if (in.Location() == QUDA_CUDA_FIELD_LOCATION) - input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h : &fine_tmp_h->Even(); + input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? 
&fine_tmp_h[0] : &fine_tmp_h[0].Even(); } *input = in; From 7f7fbdd381c4795adfa4e7acc68f3255fffb322c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 29 Jan 2024 19:42:17 -0800 Subject: [PATCH 04/21] Restrictor now multi-RHS aware --- include/kernels/restrictor.cuh | 61 +++++++++++++++++-------- include/transfer.h | 12 ++--- include/tunable_block_reduction.h | 2 +- lib/restrictor.in.cpp | 20 ++++---- lib/restrictor.in.cu | 65 +++++++++++++------------- lib/staggered_prolong_restrict.cu | 8 ++++ lib/transfer.cpp | 76 ++++++++++++++++++++----------- 7 files changed, 149 insertions(+), 95 deletions(-) diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh index c121f86dca..9da7633071 100644 --- a/include/kernels/restrictor.cuh +++ b/include/kernels/restrictor.cuh @@ -29,9 +29,17 @@ namespace quda { static constexpr int coarseSpin = coarseSpin_; static constexpr int coarseColor = coarseColor_; - FieldOrderCB(coarseSpin)> out; - const FieldOrderCB(fineSpin)> in; - const FieldOrderCB(fineSpin), vFloat> V; + // disable ghost to reduce arg size + using F = FieldOrderCB(fineSpin), Float, Float, true>; + using C = FieldOrderCB(coarseSpin), Float, Float, true>; + using V = FieldOrderCB(fineSpin), vFloat>; + + static constexpr unsigned int max_n_src = 64; + const int_fastdiv n_src; + + C out[max_n_src]; + F in[max_n_src]; + const V v; const int aggregate_size; // number of sites that form a single aggregate const int_fastdiv aggregate_size_cb; // number of checkerboard sites that form a single aggregate const int *fine_to_coarse; @@ -51,26 +59,37 @@ namespace quda { dim3 grid_dim; dim3 block_dim; - RestrictArg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V, + RestrictArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *coarse_to_fine, int parity) : - kernel_param(dim3(in.Volume()/out.Volume(), 1, coarseColor/coarse_colors_per_thread())), - out(out), in(in), V(V), - aggregate_size(in.Volume()/out.Volume()), - aggregate_size_cb(in.VolumeCB()/out.Volume()), - fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine), - spin_map(), parity(parity), nParity(in.SiteSubset()), swizzle_factor(1) - { } + kernel_param(dim3(in[0].Volume() / out[0].Volume(), 1, out.size() * (coarseColor / coarse_colors_per_thread()))), + n_src(out.size()), + v(v), + aggregate_size(in[0].Volume() / out[0].Volume()), + aggregate_size_cb(in[0].VolumeCB() / out[0].Volume()), + fine_to_coarse(fine_to_coarse), + coarse_to_fine(coarse_to_fine), + spin_map(), + parity(parity), + nParity(in[0].SiteSubset()), + swizzle_factor(1) + { + if (out.size() > max_n_src) errorQuda("vector set size %lu greater than max size %d", out.size(), max_n_src); + for (auto i = 0u; i < out.size(); i++) { + this->out[i] = out[i]; + this->in[i] = in[i]; + } + } }; /** Rotates from the fine-color basis into the coarse-color basis. */ template - __device__ __host__ inline void rotateCoarseColor(Out &out, const Arg &arg, int parity, int x_cb, int coarse_color_block) + __device__ __host__ inline void rotateCoarseColor(Out &out, const Arg &arg, int src_idx, int parity, int x_cb, int coarse_color_block) { constexpr int coarse_color_per_thread = coarse_colors_per_thread(); const int spinor_parity = (arg.nParity == 2) ? parity : 0; - const int v_parity = (arg.V.Nparity() == 2) ? parity : 0; + const int v_parity = (arg.v.Nparity() == 2) ? 
parity : 0; #pragma unroll for (int s=0; s, Arg::fineSpin * coarse_color_per_thread> tmp{0}; - rotateCoarseColor(tmp, arg, parity, x_fine_cb, coarse_color_block); + rotateCoarseColor(tmp, arg, src_idx, parity, x_fine_cb, coarse_color_block); // perform any local spin coarsening #pragma unroll @@ -155,15 +176,15 @@ namespace quda { reduced = BlockReduce(thread.z).Sum(reduced); if (target::thread_idx().x == 0) { - const int parity_coarse = x_coarse >= arg.out.VolumeCB() ? 1 : 0; - const int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB(); + const int parity_coarse = x_coarse >= arg.out[src_idx].VolumeCB() ? 1 : 0; + const int x_coarse_cb = x_coarse - parity_coarse*arg.out[src_idx].VolumeCB(); #pragma unroll for (int s = 0; s < Arg::coarseSpin; s++) { #pragma unroll for (int coarse_color_local=0; coarse_color_local &out, cvector_ref &in) const; /** * @brief The precision of the packed null-space vectors @@ -332,12 +332,12 @@ namespace quda { @param[in] spin_map Spin blocking lookup table @param[in] parity of the input fine field (if single parity input field) */ - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse, - const int *coarse_to_fine, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, + const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); template - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse, - const int *coarse_to_fine, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, + const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); /** @brief Apply the unitary "prolongation" operator for Kahler-Dirac preconditioning @@ -358,7 +358,7 @@ namespace quda { @param[in] spin_map Spin blocking lookup table @param[in] parity of the output fine field (if single parity output field) */ - void StaggeredRestrict(ColorSpinorField &out, const ColorSpinorField &in, const int *fine_to_coarse, + void StaggeredRestrict(cvector_ref &out, cvector_ref &in, const int *fine_to_coarse, const int *const *spin_map, int parity = QUDA_INVALID_PARITY); } // namespace quda diff --git a/include/tunable_block_reduction.h b/include/tunable_block_reduction.h index fcd73e5a92..bf4529dbf1 100644 --- a/include/tunable_block_reduction.h +++ b/include/tunable_block_reduction.h @@ -12,7 +12,7 @@ namespace quda @brief This derived tunable class is for block reduction kernels, and partners the BlockReduction2D kernel. Each reduction block is mapped to x grid, with each block mapping to a thread block. - Each thread block is potentially two dimensional, with the y + Each thread block is potentially two dimensional, with the z dimension can be mapped to vector of computations, similar to TunableKernel2D. 
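     (In this patch's multi-RHS restrictor the right-hand-side index is folded
     into this z dimension together with the coarse-color block.)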
Due to the exact mapping of reduction block to the thread blocks, no block-size tuning is performed in the x
diff --git a/lib/restrictor.in.cpp b/lib/restrictor.in.cpp
index ac982fde69..2f6d2697e0 100644
--- a/lib/restrictor.in.cpp
+++ b/lib/restrictor.in.cpp
@@ -7,10 +7,10 @@ namespace quda
  };

  template
-  void Restrict2(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse,
-                 const int *coarse_to_fine, const int *const *spin_map, int parity, IntList)
+  void Restrict2(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
+                 const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity, IntList)
  {
-    if (out.Ncolor() == coarseColor) {
+    if (out[0].Ncolor() == coarseColor) {
      if constexpr (coarseColor >= fineColor) {
        Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity);
      } else {
@@ -20,16 +20,16 @@ namespace quda
      if constexpr (sizeof...(N) > 0) {
        Restrict2(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity, IntList());
      } else {
-        errorQuda("Coarse Nc = %d has not been instantiated", out.Ncolor());
+        errorQuda("Coarse Nc = %d has not been instantiated", out[0].Ncolor());
      }
    }
  }

  template
-  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse,
-                const int *coarse_to_fine, const int *const *spin_map, int parity, IntList)
+  void Restrict(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
+                const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity, IntList)
  {
-    if (in.Ncolor() == fineColor) {
+    if (in[0].Ncolor() == fineColor) {
      // clang-format off
      IntList<@QUDA_MULTIGRID_NVEC_LIST@> coarseColors;
      // clang-format on
@@ -38,13 +38,13 @@ namespace quda
      if constexpr (sizeof...(N) > 0) {
        Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity, IntList());
      } else {
-        errorQuda("Fine Nc = %d has not been instantiated", in.Ncolor());
+        errorQuda("Fine Nc = %d has not been instantiated", in[0].Ncolor());
      }
    }
  }

-  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, const int *fine_to_coarse,
-                const int *coarse_to_fine, const int *const *spin_map, int parity)
+  void Restrict(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
+                const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity)
  {
    if constexpr (is_enabled_multigrid()) {
      // clang-format off
diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu
index d1c2af1d4d..27a1bd5c5d 100644
--- a/lib/restrictor.in.cu
+++ b/lib/restrictor.in.cu
@@ -16,8 +16,8 @@ namespace quda {
  template
  class RestrictLaunch : public TunableBlock2D {
    using Arg = RestrictArg;
-    ColorSpinorField &out;
-    const ColorSpinorField &in;
+    cvector_ref<ColorSpinorField> &out;
+    cvector_ref<const ColorSpinorField> &in;
    const ColorSpinorField &v;
    const int *fine_to_coarse;
    const int *coarse_to_fine;
    int parity;

    bool tuneSharedBytes() const { return false; }
    bool tuneAuxDim() const { return true; }
-    unsigned int minThreads() const { return in.Volume(); } // fine parity is the block y dimension
+    unsigned int minThreads() const { return in[0].Volume(); } // fine parity is the block y dimension

  public:
-    RestrictLaunch(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
+    RestrictLaunch(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &v,
                   const int *fine_to_coarse, const int *coarse_to_fine, int parity) :
-      TunableBlock2D(in, false, coarseColor / coarse_colors_per_thread(), max_z_block()),
+      TunableBlock2D(in[0], false, out.size() * (coarseColor / coarse_colors_per_thread()), max_z_block()),
      out(out), in(in), v(v), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine), parity(parity)
    {
      strcat(vol, ",");
-      strcat(vol, out.VolString().c_str());
+      strcat(vol, out[0].VolString().c_str());
      strcat(aux, ",");
-      strcat(aux, out.AuxString().c_str());
+      strcat(aux, out[0].AuxString().c_str());

      apply(device::get_default_stream());
    }

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      if (checkNative(out, in, v)) {
+      if (checkNative(out[0], in[0], v)) {
        Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
        arg.swizzle_factor = tp.aux.x;
        launch(tp, stream, arg);
@@ -75,42 +75,45 @@
     */
    unsigned int blockMapper() const
    {
-      auto aggregate_size = in.Volume() / out.Volume();
+      auto aggregate_size = in[0].Volume() / out[0].Volume();
      auto max_block = 128u;
      for (uint32_t b = blockMin(); b < max_block; b += blockStep())
        if (aggregate_size < b) return b;
      return max_block;
    }

-    void initTuneParam(TuneParam &param) const {
+    void initTuneParam(TuneParam &param) const
+    {
      TunableBlock2D::initTuneParam(param);
      param.block.x = blockMapper();
-      param.grid.x = out.Volume();
+      param.grid.x = out[0].Volume();
      param.shared_bytes = 0;
      param.aux.x = 2; // swizzle factor
    }

-    void defaultTuneParam(TuneParam &param) const {
+    void defaultTuneParam(TuneParam &param) const
+    {
      TunableBlock2D::defaultTuneParam(param);
      param.block.x = blockMapper();
-      param.grid.x = out.Volume();
+      param.grid.x = out[0].Volume();
      param.shared_bytes = 0;
      param.aux.x = 2; // swizzle factor
    }

-    long long flops() const { return 8 * fineSpin * fineColor * coarseColor * in.SiteSubset()*(long long)in.VolumeCB(); }
+    long long flops() const { return out.size() * 8 * fineSpin * fineColor * coarseColor * in[0].SiteSubset()*in[0].VolumeCB(); }

-    long long bytes() const {
-      size_t v_bytes = v.Bytes() / (v.SiteSubset() == in.SiteSubset() ? 1 : 2);
-      return in.Bytes() + out.Bytes() + v_bytes + in.SiteSubset()*in.VolumeCB()*sizeof(int);
+    long long bytes() const
+    {
+      size_t v_bytes = v.Bytes() / (v.SiteSubset() == in[0].SiteSubset() ?
1 : 2); + return out.size() * (in[0].Bytes() + out[0].Bytes() + v_bytes + in[0].SiteSubset() * in[0].VolumeCB() * sizeof(int)); } }; template - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) { - if (out.Nspin() != 2) errorQuda("Unsupported nSpin %d", out.Nspin()); + if (out[0].Nspin() != 2) errorQuda("Unsupported nSpin %d", out[0].Nspin()); constexpr int coarseSpin = 2; // first check that the spin_map matches the spin_mapper @@ -126,7 +129,7 @@ namespace quda { } else { errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION); } - } else if (v.Precision() == in.Precision()) { + } else if (v.Precision() == in[0].Precision()) { RestrictLaunch restrictor(out, in, v, fine_to_coarse, coarse_to_fine, parity); } else { @@ -135,25 +138,25 @@ namespace quda { } template - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) { - if (!is_enabled_spin(in.Nspin())) errorQuda("nSpin %d has not been built", in.Nspin()); + if (!is_enabled_spin(in[0].Nspin())) errorQuda("nSpin %d has not been built", in[0].Nspin()); - if (in.Nspin() == 2) { + if (in[0].Nspin() == 2) { Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity); } else if constexpr (fineColor == 3) { - if (in.Nspin() == 4) { + if (in[0].Nspin() == 4) { if constexpr (is_enabled_spin(4)) Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity); - } else if (in.Nspin() == 1) { + } else if (in[0].Nspin() == 1) { if constexpr (is_enabled_spin(1)) Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity); } else { - errorQuda("Unexpected nSpin = %d", in.Nspin()); + errorQuda("Unexpected nSpin = %d", in[0].Nspin()); } } else { - errorQuda("Unexpected spin %d and color %d combination", in.Nspin(), in.Ncolor()); + errorQuda("Unexpected spin %d and color %d combination", in[0].Nspin(), in[0].Ncolor()); } } @@ -161,11 +164,11 @@ namespace quda { constexpr int coarseColor = @QUDA_MULTIGRID_NVEC2@; template <> - void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, + void Restrict(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *fine_to_coarse, const int *coarse_to_fine, const int * const * spin_map, int parity) { - checkLocation(out, in, v); - QudaPrecision precision = checkPrecision(out, in); + checkLocation(out[0], in[0], v); + QudaPrecision precision = checkPrecision(out[0], in[0]); if constexpr (is_enabled_multigrid()) { if (precision == QUDA_DOUBLE_PRECISION) { @@ -175,7 +178,7 @@ namespace quda { } else if (precision == QUDA_SINGLE_PRECISION) { Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity); } else { - errorQuda("Unsupported precision %d", out.Precision()); + errorQuda("Unsupported precision %d", out[0].Precision()); } } else { errorQuda("Multigrid has not been built"); diff --git a/lib/staggered_prolong_restrict.cu b/lib/staggered_prolong_restrict.cu index a8ae5ad604..0303817456 100644 --- a/lib/staggered_prolong_restrict.cu +++ b/lib/staggered_prolong_restrict.cu @@ -122,6 +122,14 @@ namespace quda { void StaggeredRestrict(ColorSpinorField &, const ColorSpinorField &, const int 
*, const int * const *, int) { errorQuda("Staggered multigrid has not been build"); } #endif + void StaggeredRestrict(cvector_ref &out, cvector_ref &in, + const int *fine_to_coarse, const int * const * spin_map, int parity) + { + for (auto i = 0u; i < out.size(); i++) { + StaggeredProlongateRestrict(out[i], in[i], fine_to_coarse, spin_map, parity); + } + } + void StaggeredProlongate(cvector_ref &out, cvector_ref &in, const int *fine_to_coarse, const int * const * spin_map, int parity) { diff --git a/lib/transfer.cpp b/lib/transfer.cpp index 3ae1a415bd..895055219c 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -362,9 +362,9 @@ namespace quda { } } else if (transfer_type == QUDA_TRANSFER_AGGREGATE) { + std::vector input(in.size()); std::vector output(out.size()); - const ColorSpinorField *V = use_gpu ? V_d : V_h; if (use_gpu) { @@ -378,7 +378,7 @@ namespace quda { // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION || out[0].GammaBasis() != V->GammaBasis()) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); + for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -390,7 +390,7 @@ namespace quda { // set output fields if (out[0].Location() == QUDA_CUDA_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); + for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -418,64 +418,86 @@ namespace quda { } // apply the restrictor - void Transfer::R(ColorSpinorField &out, const ColorSpinorField &in) const + void Transfer::R(cvector_ref &out, cvector_ref &in) const { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - ColorSpinorField *input = &const_cast(in); - ColorSpinorField *output = &out; initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, 1); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; const int *coarse_to_fine = use_gpu ? 
coarse_to_fine_d : coarse_to_fine_h;

    if (transfer_type == QUDA_TRANSFER_COARSE_KD) {
-      StaggeredRestrict(*output, *input, fine_to_coarse, spin_map, parity);
+      StaggeredRestrict(out, in, fine_to_coarse, spin_map, parity);
    } else if (transfer_type == QUDA_TRANSFER_OPTIMIZED_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) {
-      if (out.SiteSubset() != QUDA_FULL_SITE_SUBSET) errorQuda("Optimized KD op only supports full-parity spinors");
-
-      if (output->VolumeCB() != input->VolumeCB()) errorQuda("Optimized KD transfer is only between equal volumes");
+      if (out[0].SiteSubset() != QUDA_FULL_SITE_SUBSET) errorQuda("Optimized KD op only supports full-parity spinors");
+      if (out[0].VolumeCB() != in[0].VolumeCB()) errorQuda("Optimized KD transfer is only between equal volumes");

      // the optimized KD op acts on fine spinors
-      if (in.SiteSubset() == QUDA_PARITY_SITE_SUBSET) {
-        output->Even() = *input;
-        blas::zero(output->Odd());
+      if (in[0].SiteSubset() == QUDA_PARITY_SITE_SUBSET) {
+        for (auto i = 0u; i < out.size(); i++) out[i].Even() = in[i];
+        for (auto i = 0u; i < out.size(); i++) blas::zero(out[i].Odd());
      } else {
-        *output = *input;
+        for (auto i = 0u; i < out.size(); i++) out[i] = in[i];
      }
+
    } else if (transfer_type == QUDA_TRANSFER_AGGREGATE) {
+      std::vector<ColorSpinorField> input(in.size());
+      std::vector<ColorSpinorField> output(out.size());
      const ColorSpinorField *V = use_gpu ? V_d : V_h;

      if (use_gpu) {
-        if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = &coarse_tmp_d[0];
-        if (in.Location() == QUDA_CPU_FIELD_LOCATION || in.GammaBasis() != V->GammaBasis())
-          input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? &fine_tmp_d[0] : &fine_tmp_d[0].Even();
+
+        // set input fields
+        if (in[0].Location() == QUDA_CPU_FIELD_LOCATION || in[0].GammaBasis() != V->GammaBasis()) {
+          for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias();
+        } else {
+          for (auto i = 0u; i < in.size(); i++) input[i] = const_cast<ColorSpinorField &>(in[i]).create_alias();
+        }
+
+        // set output fields
+        if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) {
+          for (auto i = 0u; i < out.size(); i++) output[i] = coarse_tmp_d[i].create_alias();
+        } else {
+          for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias();
+        }

        if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU");
      } else {
-        if (in.Location() == QUDA_CUDA_FIELD_LOCATION)
-          input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? &fine_tmp_h[0] : &fine_tmp_h[0].Even();
+
+        // set input fields
+        if (in[0].Location() == QUDA_CUDA_FIELD_LOCATION) {
+          for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias();
+        } else {
+          for (auto i = 0u; i < in.size(); i++) input[i] = const_cast<ColorSpinorField &>(in[i]).create_alias();
+        }
+
+        // set output fields
+        for (auto i = 0u; i < out.size(); i++) output[i] = const_cast<ColorSpinorField &>(out[i]).create_alias();
+
      }

-      *input = in;
+      for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically) FIXME - maybe not?
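+      // The intent: when input[i] merely aliases in[i] the copy above collapses to
+      // a no-op, and data only moves when input[i] points at a tmp field because a
+      // location or gamma-basis change is needed; the FIXME records that this
+      // aliasing shortcut is as yet unverified for m-rhs.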
- if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && in.SiteSubset() == QUDA_FULL_SITE_SUBSET) + if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && in[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) errorQuda("Cannot restrict a full field since only have single parity null-space components"); - if (V->Nspin() != 1 && (output->GammaBasis() != V->GammaBasis() || input->GammaBasis() != V->GammaBasis())) + if (V->Nspin() != 1 && (output[0].GammaBasis() != V->GammaBasis() || input[0].GammaBasis() != V->GammaBasis())) errorQuda("Cannot apply restrictor using fields in a different basis from the null space (%d,%d) != %d", - out.GammaBasis(), input->GammaBasis(), V->GammaBasis()); + out[0].GammaBasis(), input[0].GammaBasis(), V->GammaBasis()); - Restrict(*output, *input, *V, fine_to_coarse, coarse_to_fine, spin_map, parity); + Restrict(output, input, *V, fine_to_coarse, coarse_to_fine, spin_map, parity); + + for (auto i = 0u; i < out.size(); i++) out[i] = output[i]; // copy result to out field (aliasing handled automatically) } else { errorQuda("Invalid transfer type in restrict"); } - out = *output; // copy result to out field (aliasing handled automatically) - // only need to synchronize if we're transferring from GPU to CPU - if (out.Location() == QUDA_CPU_FIELD_LOCATION && in.Location() == QUDA_CUDA_FIELD_LOCATION) + if (out[0].Location() == QUDA_CPU_FIELD_LOCATION && in[0].Location() == QUDA_CUDA_FIELD_LOCATION) qudaDeviceSynchronize(); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); From 7122947b69b88fdf77a4723b63cebabb177499bb Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 Jan 2024 09:00:57 -0800 Subject: [PATCH 05/21] Multigrid now stores its null space vectors using objects not pointers --- include/kernels/block_orthogonalize.cuh | 2 +- include/multigrid.h | 72 +++--- include/transfer.h | 12 +- lib/block_orthogonalize.in.cpp | 6 +- lib/block_orthogonalize.in.cu | 48 ++-- lib/interface_quda.cpp | 2 +- lib/multigrid.cpp | 331 +++++++++++------------- lib/transfer.cpp | 68 ++--- 8 files changed, 256 insertions(+), 285 deletions(-) diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh index d2d8c3cd6f..bec77273b7 100644 --- a/include/kernels/block_orthogonalize.cuh +++ b/include/kernels/block_orthogonalize.cuh @@ -69,7 +69,7 @@ namespace quda { nParity(meta.SiteSubset()), nBlockOrtho(n_block_ortho), fineVolumeCB(meta.VolumeCB()), - B{*B...}, + B{B...}, grid_dim(), block_dim() { diff --git a/include/multigrid.h b/include/multigrid.h index a3ec2ce491..16250e5bdc 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -112,7 +112,7 @@ namespace quda { MG *fine; /** The null space vectors */ - std::vector &B; + std::vector &B; /** Number of pre-smoothing applications to perform */ int nu_pre; @@ -172,7 +172,7 @@ namespace quda { /** This is top level instantiation done when we start creating the multigrid operator. 
*/ - MGParam(QudaMultigridParam &param, std::vector<ColorSpinorField *> &B, DiracMatrix *matResidual, + MGParam(QudaMultigridParam &param, std::vector<ColorSpinorField> &B, DiracMatrix *matResidual, DiracMatrix *matSmooth, DiracMatrix *matSmoothSloppy, int level = 0) : SolverParam(*(param.invert_param)), mg_global(param), @@ -208,7 +208,7 @@ namespace quda { omega = param.omega[level]; } - MGParam(const MGParam &param, std::vector<ColorSpinorField *> &B, DiracMatrix *matResidual, DiracMatrix *matSmooth, + MGParam(const MGParam &param, std::vector<ColorSpinorField> &B, DiracMatrix *matResidual, DiracMatrix *matSmooth, DiracMatrix *matSmoothSloppy, int level = 0) : SolverParam(param), mg_global(param.mg_global), @@ -257,13 +257,15 @@ namespace quda { MGParam &param; /** This is the transfer operator that defines the prolongation and restriction operators */ - Transfer *transfer; + Transfer *transfer = nullptr; /** This tells reset() whether the transfer operator needs to be rebuilt */ - bool resetTransfer; + bool resetTransfer = false; /** This is the smoother used */ - Solver *presmoother, *postsmoother; + Solver *presmoother = nullptr; + + Solver *postsmoother = nullptr; /** Prefix label used for printf at this level */ char prefix[128]; @@ -272,43 +274,43 @@ namespace quda { char coarse_prefix[128]; /** This is the next lower level */ - MG *coarse; + MG *coarse = nullptr; /** The coarse grid solver - this either points at "coarse" or a solver preconditioned by "coarse" */ - Solver *coarse_solver; + Solver *coarse_solver = nullptr; /** Storage for the parameter struct for the coarse grid */ - MGParam *param_coarse; + MGParam *param_coarse = nullptr; /** Storage for the parameter struct for the pre-smoother */ - SolverParam *param_presmooth; + SolverParam *param_presmooth = nullptr; /** Storage for the parameter struct for the post-smoother */ - SolverParam *param_postsmooth; + SolverParam *param_postsmooth = nullptr; /** Storage for the parameter struct for the coarse solver */ - SolverParam *param_coarse_solver; + SolverParam *param_coarse_solver = nullptr; /** The coarse-grid representation of the null space vectors */ - std::vector<ColorSpinorField *> *B_coarse; + std::vector<ColorSpinorField> B_coarse; /** Residual vector */ - ColorSpinorField *r; + ColorSpinorField *r = nullptr; /** Projected source vector for preconditioned system, else just points to source */ - ColorSpinorField *b_tilde; + ColorSpinorField *b_tilde = nullptr; /** Coarse residual vector */ - ColorSpinorField *r_coarse; + ColorSpinorField *r_coarse = nullptr; /** Coarse solution vector */ - ColorSpinorField *x_coarse; + ColorSpinorField *x_coarse = nullptr; /** Coarse temporary vector */ - ColorSpinorField *tmp_coarse; + ColorSpinorField *tmp_coarse = nullptr; /** Sloppy coarse temporary vector */ - ColorSpinorField *tmp_coarse_sloppy; + ColorSpinorField *tmp_coarse_sloppy = nullptr; /** Kahler-Dirac Xinv */ std::shared_ptr<GaugeField> xInvKD; @@ -317,34 +319,34 @@ namespace quda { std::shared_ptr<GaugeField> xInvKD_sloppy; /** The fine operator used for computing inter-grid residuals */ - const Dirac *diracResidual; + const Dirac *diracResidual = nullptr; /** The fine operator used for doing smoothing */ - const Dirac *diracSmoother; + const Dirac *diracSmoother = nullptr; /** The fine operator used for doing sloppy smoothing */ - const Dirac *diracSmootherSloppy; + const Dirac *diracSmootherSloppy = nullptr; /** The coarse operator used for computing inter-grid residuals */ - Dirac *diracCoarseResidual; + Dirac *diracCoarseResidual = nullptr; /** The coarse operator used for doing smoothing */ - Dirac *diracCoarseSmoother; + Dirac *diracCoarseSmoother = nullptr; /** The coarse operator
used for doing sloppy smoothing */ - Dirac *diracCoarseSmootherSloppy; + Dirac *diracCoarseSmootherSloppy = nullptr; /** Wrapper for the residual coarse grid operator */ - DiracMatrix *matCoarseResidual; + DiracMatrix *matCoarseResidual = nullptr; /** Wrapper for the smoothing coarse grid operator */ - DiracMatrix *matCoarseSmoother; + DiracMatrix *matCoarseSmoother = nullptr; /** Wrapper for the sloppy smoothing coarse grid operator */ - DiracMatrix *matCoarseSmootherSloppy; + DiracMatrix *matCoarseSmootherSloppy = nullptr; /** Parallel hyper-cubic random number generator for generating null-space vectors */ - RNG *rng; + RNG *rng = nullptr; /** @brief Helper function called on entry to each MG function @@ -453,20 +455,20 @@ namespace quda { @brief Load the null space vectors from file @param B Loaded null-space vectors (pre-allocated) */ - void loadVectors(std::vector<ColorSpinorField *> &B); + void loadVectors(cvector_ref<ColorSpinorField> &B); /** @brief Save the null space vectors to file @param B Null-space vectors to save */ - void saveVectors(const std::vector<ColorSpinorField *> &B) const; + void saveVectors(cvector_ref<const ColorSpinorField> &B) const; /** @brief Generate the null-space vectors @param B Generated null-space vectors @param refresh Whether we are refreshing pre-existing vectors or starting afresh */ - void generateNullVectors(std::vector<ColorSpinorField *> &B, bool refresh=false); + void generateNullVectors(std::vector<ColorSpinorField> &B, bool refresh = false); /** @brief Generate lowest eigenvectors @@ -477,7 +479,7 @@ namespace quda { @brief Build free-field null-space vectors @param B Free-field null-space vectors */ - void buildFreeVectors(std::vector<ColorSpinorField *> &B); + void buildFreeVectors(std::vector<ColorSpinorField> &B); /** @brief Return if we're on a fine grid right now @@ -724,7 +726,7 @@ namespace quda { DiracM *mSmooth; DiracM *mSmoothSloppy; - std::vector<ColorSpinorField *> B; + std::vector<ColorSpinorField> B; MGParam *mgParam; @@ -739,8 +741,6 @@ namespace quda { if (mgParam) delete mgParam; - for (unsigned int i=0; i<B.size(); i++) delete B[i]; - const std::vector<ColorSpinorField *> &B; + const std::vector<ColorSpinorField> &B; /** The number of null space components */ const int Nvec; @@ -164,7 +164,7 @@ namespace quda { * @param null_precision The precision to store the null-space basis vectors in * @param enable_gpu Whether to enable this to run on GPU (as well as CPU) */ - Transfer(const std::vector<ColorSpinorField *> &B, int Nvec, int NblockOrtho, bool blockOrthoTwoPass, int *geo_bs, + Transfer(const std::vector<ColorSpinorField> &B, int Nvec, int NblockOrtho, bool blockOrthoTwoPass, int *geo_bs, int spin_bs, QudaPrecision null_precision, const QudaTransferType transfer_type); /** The destructor for Transfer */ @@ -194,7 +194,7 @@ namespace quda { */ QudaPrecision NullPrecision(QudaFieldLocation location) const { - return location == QUDA_CUDA_FIELD_LOCATION ? null_precision : std::max(B[0]->Precision(), QUDA_SINGLE_PRECISION); + return location == QUDA_CUDA_FIELD_LOCATION ? null_precision : std::max(B[0].Precision(), QUDA_SINGLE_PRECISION); } /** @@ -205,7 +205,7 @@ namespace quda { const ColorSpinorField& Vectors(QudaFieldLocation location=QUDA_INVALID_FIELD_LOCATION) const { if (location == QUDA_INVALID_FIELD_LOCATION) { // if not set then we return the memory space where the input vectors are stored - return B[0]->Location() == QUDA_CUDA_FIELD_LOCATION ? *V_d : *V_h; + return B[0].Location() == QUDA_CUDA_FIELD_LOCATION ? *V_d : *V_h; } else { return location == QUDA_CUDA_FIELD_LOCATION ? *V_d : *V_h; } @@ -281,11 +281,11 @@ namespace quda { calculation. This provides better accuracy in fixed-point precision.
*/ - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass); template - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass); /** diff --git a/lib/block_orthogonalize.in.cpp b/lib/block_orthogonalize.in.cpp index 3912cc0a69..1f3051bbd2 100644 --- a/lib/block_orthogonalize.in.cpp +++ b/lib/block_orthogonalize.in.cpp @@ -7,7 +7,7 @@ namespace quda }; template - void BlockOrthogonalize2(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize2(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass, IntList) { @@ -29,7 +29,7 @@ namespace quda } template - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass, IntList) { @@ -49,7 +49,7 @@ namespace quda } } - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass) { if constexpr (is_enabled_multigrid()) { diff --git a/lib/block_orthogonalize.in.cu b/lib/block_orthogonalize.in.cu index 64651fb55f..e4e3a9e12c 100644 --- a/lib/block_orthogonalize.in.cu +++ b/lib/block_orthogonalize.in.cu @@ -49,7 +49,7 @@ namespace quda { template using Arg = BlockOrthoArg; ColorSpinorField &V; - const std::vector B; + const std::vector &B; const int *fine_to_coarse; const int *coarse_to_fine; const int *geo_bs; @@ -61,7 +61,7 @@ namespace quda { double max; public: - BlockOrtho(ColorSpinorField &V, const std::vector B, const int *fine_to_coarse, + BlockOrtho(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int n_block_ortho, bool two_pass) : TunableBlock2D(V, false, chiral_blocks), V(V), @@ -113,8 +113,8 @@ namespace quda { } template - void launch_host_(const TuneParam &tp, const qudaStream_t &stream, - const std::vector &B, std::index_sequence) + void launch_host_(const TuneParam &tp, const qudaStream_t &stream, const std::vector &B, + std::index_sequence) { Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...); launch_host(tp, stream, arg); @@ -122,8 +122,8 @@ namespace quda { } template - void launch_device_(const TuneParam &tp, const qudaStream_t &stream, - const std::vector &B, std::index_sequence) + void launch_device_(const TuneParam &tp, const qudaStream_t &stream, const std::vector &B, + std::index_sequence) { Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...); arg.swizzle_factor = tp.aux.x; @@ -136,7 +136,8 @@ namespace quda { constexpr bool disable_ghost = true; TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); if (V.Location() == 
QUDA_CPU_FIELD_LOCATION) { - if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER && B[0]->FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) { + if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER + && B[0].FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) { typedef FieldOrderCB Rotator; typedef FieldOrderCB Vector; launch_host_(tp, stream, B, std::make_index_sequence()); @@ -146,12 +147,12 @@ namespace quda { } else { constexpr auto vOrder = colorspinor::getNative(nSpin); constexpr auto bOrder = colorspinor::getNative(nSpin); - if (V.FieldOrder() == vOrder && B[0]->FieldOrder() == bOrder) { + if (V.FieldOrder() == vOrder && B[0].FieldOrder() == bOrder) { typedef FieldOrderCB Rotator; typedef FieldOrderCB::value> Vector; launch_device_(tp, stream, B, std::make_index_sequence()); } else { - errorQuda("Unsupported field order V=%d B=%d", V.FieldOrder(), B[0]->FieldOrder()); + errorQuda("Unsupported field order V=%d B=%d", V.FieldOrder(), B[0].FieldOrder()); } } } @@ -203,7 +204,7 @@ namespace quda { long long bytes() const { - return nVec * B[0]->Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes() + return nVec * B[0].Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes() + (n_block_ortho - 1) * (V.Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes()); } @@ -212,7 +213,7 @@ namespace quda { }; template - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int n_block_ortho, bool two_pass) { int geo_blocksize = 1; @@ -233,7 +234,7 @@ namespace quda { } template - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass) { if (!is_enabled_spin(V.Nspin())) errorQuda("nSpin %d has not been built", V.Nspin()); @@ -270,34 +271,35 @@ namespace quda { constexpr int coarseColor = @QUDA_MULTIGRID_NVEC2@; template <> - void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, - const int *coarse_to_fine, const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass) + void BlockOrthogonalize(ColorSpinorField &V, const std::vector &B, + const int *fine_to_coarse, const int *coarse_to_fine, + const int *geo_bs, int spin_bs, int n_block_ortho, bool two_pass) { - if (!is_enabled(V.Precision()) || !is_enabled(B[0]->Precision())) - errorQuda("QUDA_PRECISION=%d does not enable required precision combination (V = %d B = %d)", - QUDA_PRECISION, V.Precision(), B[0]->Precision()); + if (!is_enabled(V.Precision()) || !is_enabled(B[0].Precision())) + errorQuda("QUDA_PRECISION=%d does not enable required precision combination (V = %d B = %d)", QUDA_PRECISION, + V.Precision(), B[0].Precision()); if constexpr (is_enabled_multigrid()) { - if (B[0]->data() == nullptr) { + if (B[0].data() == nullptr) { warningQuda("Trying to BlockOrthogonalize staggered transform, skipping..."); return; } - if (V.Precision() == QUDA_DOUBLE_PRECISION && B[0]->Precision() == QUDA_DOUBLE_PRECISION) { + if (V.Precision() == QUDA_DOUBLE_PRECISION && B[0].Precision() == QUDA_DOUBLE_PRECISION) { if constexpr (is_enabled_multigrid_double()) BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho, two_pass); else errorQuda("Double 
precision multigrid has not been enabled"); - } else if (V.Precision() == QUDA_SINGLE_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) { + } else if (V.Precision() == QUDA_SINGLE_PRECISION && B[0].Precision() == QUDA_SINGLE_PRECISION) { if constexpr (is_enabled(QUDA_SINGLE_PRECISION)) BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho, two_pass); - } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) { + } else if (V.Precision() == QUDA_HALF_PRECISION && B[0].Precision() == QUDA_SINGLE_PRECISION) { if constexpr (is_enabled(QUDA_HALF_PRECISION) && is_enabled(QUDA_SINGLE_PRECISION)) BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho, two_pass); - } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_HALF_PRECISION) { + } else if (V.Precision() == QUDA_HALF_PRECISION && B[0].Precision() == QUDA_HALF_PRECISION) { if constexpr (is_enabled(QUDA_HALF_PRECISION)) BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho, two_pass); } else { - errorQuda("Unsupported precision combination V=%d B=%d\n", V.Precision(), B[0]->Precision()); + errorQuda("Unsupported precision combination V=%d B=%d\n", V.Precision(), B[0].Precision()); } } else { errorQuda("Multigrid has not been built"); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 995b5758f4..3313db26fe 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -2376,7 +2376,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param) csParam.create = QUDA_REFERENCE_FIELD_CREATE; } - for (int i = 0; i < mg_param.n_vec[0]; i++) { B[i] = ColorSpinorField::Create(csParam); } + for (int i = 0; i < mg_param.n_vec[0]; i++) { B[i] = ColorSpinorField(csParam); } // fill out the MG parameters for the fine level mgParam = new MGParam(mg_param, B, m, mSmooth, mSmoothSloppy); diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 89403aef04..03247e2356 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -18,34 +18,11 @@ namespace quda MG::MG(MGParam ¶m) : Solver(*param.matResidual, *param.matSmooth, *param.matSmoothSloppy, *param.matSmoothSloppy, param), param(param), - transfer(0), - resetTransfer(false), - presmoother(nullptr), - postsmoother(nullptr), - coarse(nullptr), - coarse_solver(nullptr), - param_coarse(nullptr), - param_presmooth(nullptr), - param_postsmooth(nullptr), - param_coarse_solver(nullptr), - r(nullptr), - b_tilde(nullptr), - r_coarse(nullptr), - x_coarse(nullptr), - tmp_coarse(nullptr), - tmp_coarse_sloppy(nullptr), xInvKD(nullptr), xInvKD_sloppy(nullptr), diracResidual(param.matResidual->Expose()), diracSmoother(param.matSmooth->Expose()), - diracSmootherSloppy(param.matSmoothSloppy->Expose()), - diracCoarseResidual(nullptr), - diracCoarseSmoother(nullptr), - diracCoarseSmootherSloppy(nullptr), - matCoarseResidual(nullptr), - matCoarseSmoother(nullptr), - matCoarseSmootherSloppy(nullptr), - rng(nullptr) + diracSmootherSloppy(param.matSmoothSloppy->Expose()) { sprintf(prefix, "MG level %d (%s): ", param.level, param.location == QUDA_CUDA_FIELD_LOCATION ? 
"GPU" : "CPU"); pushLevel(param.level); @@ -59,7 +36,7 @@ namespace quda // allocating vectors { // create residual vectors - ColorSpinorParam csParam(*(param.B[0])); + ColorSpinorParam csParam(param.B[0]); csParam.create = QUDA_NULL_FIELD_CREATE; csParam.location = param.location; csParam.setPrecision(param.mg_global.invert_param->cuda_prec_sloppy, QUDA_INVALID_PRECISION, @@ -67,7 +44,8 @@ namespace quda if (csParam.location==QUDA_CUDA_FIELD_LOCATION) { csParam.gammaBasis = param.level > 0 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS: QUDA_UKQCD_GAMMA_BASIS; } - if (param.B[0]->Nspin() == 1) csParam.gammaBasis = param.B[0]->GammaBasis(); // hack for staggered to avoid unnecessary basis checks + if (param.B[0].Nspin() == 1) + csParam.gammaBasis = param.B[0].GammaBasis(); // hack for staggered to avoid unnecessary basis checks r = new ColorSpinorField(csParam); // if we're using preconditioning then allocate storage for the preconditioned source vector @@ -78,7 +56,7 @@ namespace quda } } - rng = new RNG(*param.B[0], 1234); + rng = new RNG(param.B[0], 1234); if (param.transfer_type == QUDA_TRANSFER_AGGREGATE) { if (param.level < param.Nlevel - 1) { @@ -88,7 +66,7 @@ namespace quda // Initializing to random vectors for (int i = 0; i < (int)param.B.size(); i++) { spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); - *param.B[i] = *r; + param.B[i] = *r; } } if (param.mg_global.num_setup_iter[param.level] > 0) { @@ -122,7 +100,7 @@ namespace quda void MG::reset(bool refresh) { pushLevel(param.level); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("%s level %d\n", transfer ? "Resetting" : "Creating", param.level); + logQuda(QUDA_VERBOSE, "%s level %d\n", transfer ? "Resetting" : "Creating", param.level); destroySmoother(); destroyCoarseSolver(); @@ -153,50 +131,51 @@ namespace quda } } else { // create transfer operator - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating transfer operator\n"); + logQuda(QUDA_VERBOSE, "Creating transfer operator\n"); transfer = new Transfer(param.B, param.Nvec, param.NblockOrtho, param.blockOrthoTwoPass, param.geoBlockSize, param.spinBlockSize, param.mg_global.precision_null[param.level], param.mg_global.transfer_type[param.level]); - for (int i=0; iCreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + tmp_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); // create coarse residual vector if not already created in verify() if (!r_coarse) - r_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + r_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); // create coarse solution vector if not already created in verify() if (!x_coarse) - x_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + x_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); - B_coarse = new std::vector(); int nVec_coarse = std::max(param.Nvec, param.mg_global.n_vec[param.level + 1]); - B_coarse->resize(nVec_coarse); + B_coarse.resize(nVec_coarse); // only have single precision B vectors on the coarse grid QudaPrecision B_coarse_precision = 
std::max(param.mg_global.precision_null[param.level+1], QUDA_SINGLE_PRECISION); for (int i=0; iCreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, B_coarse_precision, param.mg_global.setup_location[param.level+1]); + B_coarse[i] = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, + B_coarse_precision, param.mg_global.setup_location[param.level + 1]); // if we're not generating on all levels then we need to propagate the vectors down if ((param.level != 0 || param.Nlevel - 1) && param.mg_global.generate_all_levels == QUDA_BOOLEAN_FALSE) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); - for (int i=0; iR(*(*B_coarse)[i], *(param.B[i])); + logQuda(QUDA_VERBOSE, "Restricting null space vectors\n"); + for (int i = 0; i < param.Nvec; i++) { + zero(B_coarse[i]); + transfer->R(B_coarse[i], param.B[i]); } } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transfer operator done\n"); + logQuda(QUDA_VERBOSE, "Transfer operator done\n"); } // we no longer need the B fields for this level, can evict them to host memory // (only if using managed memory and prefetching is enabled, otherwise no-op) - for (int i = 0; i < param.Nvec; i++) { param.B[i]->prefetch(QUDA_CPU_FIELD_LOCATION); } + for (int i = 0; i < param.Nvec; i++) { param.B[i].prefetch(QUDA_CPU_FIELD_LOCATION); } createCoarseDirac(); } @@ -219,8 +198,8 @@ namespace quda coarse->reset(refresh); } else { // create the next multigrid level - param_coarse = new MGParam(param, *B_coarse, matCoarseResidual, matCoarseSmoother, matCoarseSmootherSloppy, - param.level + 1); + param_coarse + = new MGParam(param, B_coarse, matCoarseResidual, matCoarseSmoother, matCoarseSmootherSloppy, param.level + 1); param_coarse->fine = this; param_coarse->delta = 1e-20; param_coarse->precision = param.mg_global.invert_param->cuda_prec_precondition; @@ -238,7 +217,7 @@ namespace quda diracSmoother->prefetch(QUDA_CUDA_FIELD_LOCATION); diracSmootherSloppy->prefetch(QUDA_CUDA_FIELD_LOCATION); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Setup of level %d done\n", param.level); + logQuda(QUDA_VERBOSE, "Setup of level %d done\n", param.level); popLevel(); } @@ -321,7 +300,7 @@ namespace quda pushLevel(param.level); // create the smoother for this level - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating smoother\n"); + logQuda(QUDA_VERBOSE, "Creating smoother\n"); destroySmoother(); param_presmooth = new SolverParam(param); @@ -381,7 +360,7 @@ namespace quda *param.matSmoothSloppy) : nullptr; } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Smoother done\n"); + logQuda(QUDA_VERBOSE, "Smoother done\n"); popLevel(); } @@ -389,7 +368,7 @@ namespace quda void MG::createCoarseDirac() { pushLevel(param.level); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating coarse Dirac operator\n"); + logQuda(QUDA_VERBOSE, "Creating coarse Dirac operator\n"); // check if we are coarsening the preconditioned system then bool preconditioned_coarsen = (param.coarse_grid_solution_type == QUDA_MATPC_SOLUTION && param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE); @@ -415,7 +394,7 @@ namespace quda // Parameters that matter for coarse construction and application diracParam.dirac = preconditioned_coarsen ? const_cast(diracSmoother) : const_cast(diracResidual); - diracParam.kappa = (param.B[0]->Nspin() == 1) ? + diracParam.kappa = (param.B[0].Nspin() == 1) ? 
-1.0 : diracParam.dirac->Kappa(); // -1 cancels automatic kappa in application of Y fields diracParam.mass = diracParam.dirac->Mass(); @@ -475,7 +454,7 @@ namespace quda matCoarseSmoother = new DiracM(*diracCoarseSmoother); matCoarseSmootherSloppy = new DiracM(*diracCoarseSmootherSloppy); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Coarse Dirac operator done\n"); + logQuda(QUDA_VERBOSE, "Coarse Dirac operator done\n"); popLevel(); } @@ -596,7 +575,7 @@ namespace quda int defl_size = coarse_solver_inner.deflationSpaceSize(); if (defl_size > 0 && transfer && param.mg_global.preserve_deflation) { // Deflation space exists and we are going to create a new solver. Extract deflation space. - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Extracting deflation space size %d to MG\n", defl_size); + logQuda(QUDA_VERBOSE, "Extracting deflation space size %d to MG\n", defl_size); coarse_solver_inner.extractDeflationSpace(evecs); } delete coarse_solver; @@ -616,12 +595,12 @@ namespace quda void MG::createCoarseSolver() { pushLevel(param.level); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating coarse solver wrapper\n"); + logQuda(QUDA_VERBOSE, "Creating coarse solver wrapper\n"); destroyCoarseSolver(); if (param.cycle_type == QUDA_MG_CYCLE_VCYCLE && param.level < param.Nlevel-2) { // if coarse solver is not a bottom solver and on the second to bottom level then we can just use the coarse solver as is coarse_solver = coarse; - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Assigned coarse solver to coarse MG operator\n"); + logQuda(QUDA_VERBOSE, "Assigned coarse solver to coarse MG operator\n"); } else if (param.cycle_type == QUDA_MG_CYCLE_RECURSIVE || param.level == param.Nlevel-2) { param_coarse_solver = new SolverParam(param); @@ -730,8 +709,7 @@ namespace quda // vectors stored in the parent MG instead coarse_solver_inner.setDeflateCompute(false); coarse_solver_inner.setRecomputeEvals(true); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Transferring deflation space size %d to coarse solver\n", defl_size); + logQuda(QUDA_VERBOSE, "Transferring deflation space size %d to coarse solver\n", defl_size); // Create space in coarse solver to hold deflation space, destroy space in MG. 
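// Sketch of the hand-off pattern used here (solver names are illustrative, the two
// member calls are the ones visible in this hunk): the deflation space is extracted
// before the old coarse solver is destroyed and injected into its replacement, so
// the expensively computed eigenvectors survive the rebuild:
//   old_coarse_solver.extractDeflationSpace(evecs); // MG takes ownership of evecs
//   delete old_coarse_solver;
//   new_coarse_solver.injectDeflationSpace(evecs);  // ownership moves to the new solver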
coarse_solver_inner.injectDeflationSpace(evecs); } @@ -745,11 +723,11 @@ namespace quda param_coarse_solver->maxiter = param.mg_global.coarse_solver_maxiter[param.level + 1]; } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Assigned coarse solver to preconditioned GCR solver\n"); + logQuda(QUDA_VERBOSE, "Assigned coarse solver to preconditioned GCR solver\n"); } else { errorQuda("Multigrid cycle type %d not supported", param.cycle_type); } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Coarse solver wrapper done\n"); + logQuda(QUDA_VERBOSE, "Coarse solver wrapper done\n"); popLevel(); } @@ -765,12 +743,6 @@ namespace quda if (param_coarse_solver) delete param_coarse_solver; } - if (B_coarse) { - int nVec_coarse = std::max(param.Nvec, param.mg_global.n_vec[param.level + 1]); - for (int i = 0; i < nVec_coarse; i++) - if ((*B_coarse)[i]) delete (*B_coarse)[i]; - delete B_coarse; - } if (transfer) delete transfer; if (matCoarseSmootherSloppy) delete matCoarseSmootherSloppy; if (diracCoarseSmootherSloppy) delete diracCoarseSmootherSloppy; @@ -841,7 +813,7 @@ namespace quda for (int i = 0; i < param.Nvec; i++) { // as well as copying to the correct location this also changes basis if necessary - tmp1 = *param.B[i]; + tmp1 = param.B[i]; transfer->R(*r_coarse, tmp1); transfer->P(tmp2, *r_coarse); @@ -867,14 +839,14 @@ namespace quda logQuda(QUDA_SUMMARIZE, "Checking 1 > || (1 - DP(P^dagDP)P^dag) v_k || / || v_k || for %d vectors\n", param.Nvec); for (int i = 0; i < param.Nvec; i++) { - transfer->R(*r_coarse, *(param.B[i])); + transfer->R(*r_coarse, param.B[i]); (*coarse_solver)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore prefix after return from coarse grid transfer->P(tmp2, *x_coarse); (*param.matResidual)(tmp1, tmp2); - tmp2 = *(param.B[i]); - logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e\n", i, norm2(*param.B[i]), norm2(tmp1)); - logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(*param.B[i]))); + tmp2 = param.B[i]; + logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e\n", i, norm2(param.B[i]), norm2(tmp1)); + logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(param.B[i]))); } sprintf(prefix, "MG level %d (%s): ", param.level + 1, param.location == QUDA_CUDA_FIELD_LOCATION ? 
"GPU" : "CPU"); @@ -887,14 +859,14 @@ namespace quda param.Nvec); for (int i=0; iR(*r_coarse, *(param.B[i])); + transfer->R(*r_coarse, param.B[i]); (*coarse)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore output prefix transfer->P(tmp2, *x_coarse); param.matResidual(tmp1, tmp2); - tmp2 = *(param.B[i]); - logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e ", i, norm2(*param.B[i]), norm2(tmp1)); - logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(*param.B[i])) ); + tmp2 = param.B[i]; + logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e ", i, norm2(param.B[i]), norm2(tmp1)); + logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(param.B[i])) ); } #endif @@ -902,18 +874,18 @@ namespace quda // Otherwise these get created in the coarse `reset()` routine later if (!tmp_coarse) - tmp_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + tmp_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); // create coarse residual vector if not already created in verify() if (!r_coarse) - r_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + r_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); // create coarse solution vector if not already created in verify() if (!x_coarse) - x_coarse = param.B[0]->CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + x_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), + param.mg_global.location[param.level + 1]); { logQuda(QUDA_SUMMARIZE, "Checking 0 = (1 - P^\\dagger P) eta_c\n"); @@ -1135,16 +1107,16 @@ namespace quda for (int i = 0; i < param.Nvec; i++) { // Restrict Evec, place result in r_coarse - transfer->R(*r_coarse, *param.B[i]); + transfer->R(*r_coarse, param.B[i]); // Prolong r_coarse, place result in tmp2 transfer->P(tmp2, *r_coarse); - printfQuda("Vector %d: norms v_k = %e P^dag v_k = %e PP^dag v_k = %e\n", i, norm2(*param.B[i]), + printfQuda("Vector %d: norms v_k = %e P^dag v_k = %e PP^dag v_k = %e\n", i, norm2(param.B[i]), norm2(*r_coarse), norm2(tmp2)); // Compare v_k and PP^dag v_k. 
- auto max_deviation = blas::max_deviation(tmp2, *param.B[i]); - auto l2_deviation = sqrt(xmyNorm(*param.B[i], tmp2) / norm2(*param.B[i])); + auto max_deviation = blas::max_deviation(tmp2, param.B[i]); + auto l2_deviation = sqrt(xmyNorm(param.B[i], tmp2) / norm2(param.B[i])); printfQuda("L2 relative deviation = %e max deviation = %e\n", l2_deviation, max_deviation[0]); if (param.mg_global.run_oblique_proj_check) { @@ -1156,17 +1128,17 @@ namespace quda // Oblique projections logQuda(QUDA_SUMMARIZE, "Checking 1 > || (1 - DP(P^dagDP)P^dag) v_k || / || v_k || for vector %d\n", i); - transfer->R(*r_coarse, *param.B[i]); + transfer->R(*r_coarse, param.B[i]); (*coarse_solver)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore prefix after return from coarse grid transfer->P(tmp2, *x_coarse); (*param.matResidual)(tmp1, tmp2); - logQuda(QUDA_SUMMARIZE, "Vector %d: norms v_k %e DP(P^dagDP)P^dag v_k %e\n", i, norm2(*param.B[i]), + logQuda(QUDA_SUMMARIZE, "Vector %d: norms v_k %e DP(P^dagDP)P^dag v_k %e\n", i, norm2(param.B[i]), norm2(tmp1)); - max_deviation = blas::max_deviation(tmp1, *param.B[i]); + max_deviation = blas::max_deviation(tmp1, param.B[i]); logQuda(QUDA_SUMMARIZE, "L2 relative deviation = %e, max deviation = %e\n", - sqrt(xmyNorm(*param.B[i], tmp1) / norm2(*param.B[i])), max_deviation[0]); + sqrt(xmyNorm(param.B[i], tmp1) / norm2(param.B[i])), max_deviation[0]); } sprintf(prefix, "MG level %d (%s): ", param.level + 1, @@ -1314,7 +1286,7 @@ namespace quda } // supports separate reading or single file read - void MG::loadVectors(std::vector &B) + void MG::loadVectors(cvector_ref &B) { if (param.transfer_type != QUDA_TRANSFER_AGGREGATE) { warningQuda("Cannot load near-null vectors for top level of staggered MG solve."); @@ -1327,15 +1299,13 @@ namespace quda vec_infile += "_nvec_"; vec_infile += std::to_string(param.mg_global.n_vec[param.level]); VectorIO io(vec_infile); - vector_ref B_ref; - for (auto i = 0u; i < B.size(); i++) B_ref.push_back(*B[i]); - io.load(std::move(B_ref)); + io.load(B); popLevel(); getProfile().TPSTOP(QUDA_PROFILE_IO); } } - void MG::saveVectors(const std::vector &B) const + void MG::saveVectors(cvector_ref &B) const { if (param.transfer_type != QUDA_TRANSFER_AGGREGATE) { warningQuda("Cannot save near-null vectors for top level of staggered MG solve."); @@ -1348,9 +1318,7 @@ namespace quda vec_outfile += "_nvec_"; vec_outfile += std::to_string(param.mg_global.n_vec[param.level]); VectorIO io(vec_outfile, false, param.mg_global.mg_vec_partfile[param.level]); - vector_ref B_ref; - for (auto i = 0u; i < B.size(); i++) B_ref.push_back(*B[i]); - io.save(std::move(B_ref)); + io.save(B); popLevel(); getProfile().TPSTOP(QUDA_PROFILE_IO); } @@ -1366,7 +1334,7 @@ namespace quda if (param.level < param.Nlevel - 2) coarse->dumpNullVectors(); } - void MG::generateNullVectors(std::vector &B, bool refresh) + void MG::generateNullVectors(std::vector &B, bool refresh) { pushLevel(param.level); @@ -1402,11 +1370,11 @@ namespace quda solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); solverParam.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; - ColorSpinorParam csParam(*B[0]); // Create spinor field parameters: + ColorSpinorParam csParam(B[0]); // Create spinor field parameters: csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering csParam.location = QUDA_CUDA_FIELD_LOCATION; // hard code to GPU location for null-space generation for now - csParam.gammaBasis = B[0]->Nspin() == 1 ? 
QUDA_DEGRAND_ROSSI_GAMMA_BASIS : - QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered + csParam.gammaBasis = B[0].Nspin() == 1 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS : + QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField b(csParam); ColorSpinorField x(csParam); @@ -1417,7 +1385,7 @@ namespace quda bool schwarz_reset = solverParam.inv_type != QUDA_MG_INVERTER && param.mg_global.smoother_schwarz_type[param.level] != QUDA_INVALID_SCHWARZ; if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Disabling Schwarz for null-space finding"); + logQuda(QUDA_VERBOSE, "Disabling Schwarz for null-space finding\n"); int commDim[QUDA_MAX_DIM]; for (int i = 0; i < QUDA_MAX_DIM; i++) commDim[i] = 1; diracSmootherSloppy->setCommDim(commDim); @@ -1458,54 +1426,55 @@ namespace quda } for (int si = 0; si < param.mg_global.num_setup_iter[param.level]; si++) { - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, - param.mg_global.num_setup_iter[param.level]); + logQuda(QUDA_VERBOSE, "Running vectors setup on level %d iter %d of %d\n", param.level, si + 1, + param.mg_global.num_setup_iter[param.level]); // global orthonormalization of the initial null-space vectors - if(param.mg_global.pre_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j<i; j++) { - Complex alpha = cDotProduct(*B[j], *B[i]); // <j,i> - caxpy(-alpha, *B[j], *B[i]); // i-<j,i>j + if (param.mg_global.pre_orthonormalize) { + for (auto i = 0u; i < B.size(); i++) { + for (auto j = 0u; j < i; j++) { + Complex alpha = cDotProduct(B[j], B[i]); // <j,i> + caxpy(-alpha, B[j], B[i]); // i-<j,i>j } - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/<i,i> + double nrm2 = norm2(B[i]); + if (nrm2 > 1e-16) + ax(1.0 / sqrt(nrm2), B[i]); // i/<i,i> else errorQuda("\nCannot normalize %u vector\n", i); } } // launch solver for each source - for (int i=0; i<(int)B.size(); i++) { + for (auto i = 0u; i < B.size(); i++) { if (param.mg_global.setup_type == QUDA_TEST_VECTOR_SETUP) { // DDalphaAMG test vector idea - b = *B[i]; // inverting against the vector + b = B[i]; // inverting against the vector zero(x); // with zero initial guess } else { - x = *B[i]; + x = B[i]; zero(b); } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial guess = %g\n", norm2(x)); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Initial rhs = %g\n", norm2(b)); + logQuda(QUDA_VERBOSE, "Initial guess = %g\n", norm2(x)); + logQuda(QUDA_VERBOSE, "Initial rhs = %g\n", norm2(b)); ColorSpinorField *out=nullptr, *in=nullptr; diracSmoother->prepare(in, out, x, b, QUDA_MAT_SOLUTION); (*solve)(*out, *in); diracSmoother->reconstruct(x, b, QUDA_MAT_SOLUTION); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution = %g\n", norm2(x)); - *B[i] = x; + logQuda(QUDA_VERBOSE, "Solution = %g\n", norm2(x)); + B[i] = x; } // global orthonormalization of the generated null-space vectors if (param.mg_global.post_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - for (int j=0; j<i; j++) { - Complex alpha = cDotProduct(*B[j], *B[i]); // <j,i> - caxpy(-alpha, *B[j], *B[i]); // i-<j,i>j + for (auto i = 0u; i < B.size(); i++) { + for (auto j = 0u; j < i; j++) { + Complex alpha = cDotProduct(B[j], B[i]); // <j,i> + caxpy(-alpha, B[j], B[i]); // i-<j,i>j } - double nrm2 = norm2(*B[i]); - if (sqrt(nrm2) > 1e-16) ax(1.0/sqrt(nrm2), *B[i]);// i/<i,i> + double nrm2 = norm2(B[i]); + if (sqrt(nrm2) > 1e-16) + ax(1.0 / sqrt(nrm2), B[i]); // i/<i,i> else errorQuda("\nCannot normalize %u vector (nrm=%e)\n", i, sqrt(nrm2)); } } @@ -1517,12 +1486,12 @@ namespace quda reset(); if (
param.level < param.Nlevel-2 ) { if ( param.mg_global.generate_all_levels == QUDA_BOOLEAN_TRUE ) { - coarse->generateNullVectors(*B_coarse, refresh); + coarse->generateNullVectors(B_coarse, refresh); } else { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Restricting null space vectors\n"); - for (int i=0; iR(*(*B_coarse)[i], *(param.B[i])); + logQuda(QUDA_VERBOSE, "Restricting null space vectors\n"); + for (auto i = 0; i < param.Nvec; i++) { + zero(B_coarse[i]); + transfer->R(B_coarse[i], param.B[i]); } // rebuild the transfer operator in the coarse level coarse->resetTransfer = true; @@ -1543,7 +1512,7 @@ namespace quda // reenable Schwarz if (schwarz_reset) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Reenabling Schwarz for null-space finding"); + logQuda(QUDA_VERBOSE, "Reenabling Schwarz for null-space finding\n"); int commDim[QUDA_MAX_DIM]; for (int i=0; isetCommDim(commDim); @@ -1558,15 +1527,15 @@ namespace quda // generate a full span of free vectors. // FIXME: Assumes fine level is SU(3). - void MG::buildFreeVectors(std::vector &B) + void MG::buildFreeVectors(std::vector &B) { pushLevel(param.level); const int Nvec = B.size(); // Given the number of colors and spins, figure out if the number // of vectors in 'B' makes sense. - const int Ncolor = B[0]->Ncolor(); - const int Nspin = B[0]->Nspin(); + const int Ncolor = B[0].Ncolor(); + const int Nspin = B[0].Nspin(); if (Ncolor == 3) // fine level { @@ -1575,14 +1544,13 @@ namespace quda // There needs to be 6 null vectors -> 12 after chirality. if (Nvec != 6) errorQuda("\nError in MG::buildFreeVectors: Wilson-type fermions require Nvec = 6"); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Building %d free field vectors for Wilson-type fermions\n", Nvec); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Wilson-type fermions\n", Nvec); // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); + for (int i = 0; i < Nvec; i++) zero(B[i]); // Create a temporary vector. - ColorSpinorParam csParam(*B[0]); + ColorSpinorParam csParam(B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField tmp(csParam); @@ -1590,9 +1558,9 @@ namespace quda for (int c = 0; c < Ncolor; c++) { for (int s = 0; s < 2; s++) { tmp.Source(QUDA_CONSTANT_SOURCE, 1, s, c); - xpy(tmp, *B[counter]); + xpy(tmp, B[counter]); tmp.Source(QUDA_CONSTANT_SOURCE, 1, s + 2, c); - xpy(tmp, *B[counter]); + xpy(tmp, B[counter]); counter++; } } @@ -1602,68 +1570,67 @@ namespace quda // There needs to be 24 null vectors -> 48 after chirality. if (Nvec != 24) errorQuda("\nError in MG::buildFreeVectors: Staggered-type fermions require Nvec = 24\n"); - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Building %d free field vectors for Staggered-type fermions\n", Nvec); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Staggered-type fermions\n", Nvec); // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); + for (int i = 0; i < Nvec; i++) zero(B[i]); // Create a temporary vector. - ColorSpinorParam csParam(*B[0]); + ColorSpinorParam csParam(B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField tmp(csParam); // Build free null vectors. - for (int c = 0; c < B[0]->Ncolor(); c++) { + for (int c = 0; c < B[0].Ncolor(); c++) { // Need to pair an even+odd corner together // since they'll get split up. 
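// (Assuming the usual encoding where bit 0 of the hex corner index is the x coordinate,
//  corners 2k and 2k+1 sit on opposite site parities, so summing each such pair into a
//  single null vector keeps both parities populated after the even/odd splitting.)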
// 0000, 0001 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x0, c); - xpy(tmp, *B[8 * c + 0]); + xpy(tmp, B[8 * c + 0]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x1, c); - xpy(tmp, *B[8 * c + 0]); + xpy(tmp, B[8 * c + 0]); // 0010, 0011 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x2, c); - xpy(tmp, *B[8 * c + 1]); + xpy(tmp, B[8 * c + 1]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x3, c); - xpy(tmp, *B[8 * c + 1]); + xpy(tmp, B[8 * c + 1]); // 0100, 0101 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x4, c); - xpy(tmp, *B[8 * c + 2]); + xpy(tmp, B[8 * c + 2]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x5, c); - xpy(tmp, *B[8 * c + 2]); + xpy(tmp, B[8 * c + 2]); // 0110, 0111 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x6, c); - xpy(tmp, *B[8 * c + 3]); + xpy(tmp, B[8 * c + 3]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x7, c); - xpy(tmp, *B[8 * c + 3]); + xpy(tmp, B[8 * c + 3]); // 1000, 1001 tmp.Source(QUDA_CORNER_SOURCE, 1, 0x8, c); - xpy(tmp, *B[8 * c + 4]); + xpy(tmp, B[8 * c + 4]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0x9, c); - xpy(tmp, *B[8 * c + 4]); + xpy(tmp, B[8 * c + 4]); // 1010, 1011 tmp.Source(QUDA_CORNER_SOURCE, 1, 0xA, c); - xpy(tmp, *B[8 * c + 5]); + xpy(tmp, B[8 * c + 5]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0xB, c); - xpy(tmp, *B[8 * c + 5]); + xpy(tmp, B[8 * c + 5]); // 1100, 1101 tmp.Source(QUDA_CORNER_SOURCE, 1, 0xC, c); - xpy(tmp, *B[8 * c + 6]); + xpy(tmp, B[8 * c + 6]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0xD, c); - xpy(tmp, *B[8 * c + 6]); + xpy(tmp, B[8 * c + 6]); // 1110, 1111 tmp.Source(QUDA_CORNER_SOURCE, 1, 0xE, c); - xpy(tmp, *B[8 * c + 7]); + xpy(tmp, B[8 * c + 7]); tmp.Source(QUDA_CORNER_SOURCE, 1, 0xF, c); - xpy(tmp, *B[8 * c + 7]); + xpy(tmp, B[8 * c + 7]); } } else { @@ -1674,40 +1641,40 @@ namespace quda // There needs to be Ncolor null vectors. if (Nvec != Ncolor) errorQuda("\nError in MG::buildFreeVectors: Coarse fermions require Nvec = Ncolor"); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Building %d free field vectors for Coarse fermions\n", Ncolor); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Coarse fermions\n", Ncolor); // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); + for (int i = 0; i < Nvec; i++) zero(B[i]); // Create a temporary vector. - ColorSpinorParam csParam(*B[0]); + ColorSpinorParam csParam(B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField tmp(csParam); for (int c = 0; c < Ncolor; c++) { tmp.Source(QUDA_CONSTANT_SOURCE, 1, 0, c); - xpy(tmp, *B[c]); + xpy(tmp, B[c]); tmp.Source(QUDA_CONSTANT_SOURCE, 1, 1, c); - xpy(tmp, *B[c]); + xpy(tmp, B[c]); } } else if (Nspin == 1) { // There needs to be Ncolor null vectors. if (Nvec != Ncolor) errorQuda("\nError in MG::buildFreeVectors: Coarse fermions require Nvec = Ncolor"); - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Building %d free field vectors for Coarse fermions\n", Ncolor); + logQuda(QUDA_VERBOSE, "Building %d free field vectors for Coarse fermions\n", Ncolor); // Zero the null vectors. - for (int i = 0; i < Nvec; i++) zero(*B[i]); + for (int i = 0; i < Nvec; i++) zero(B[i]); // Create a temporary vector. 
- ColorSpinorParam csParam(*B[0]); + ColorSpinorParam csParam(B[0]); csParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField tmp(csParam); for (int c = 0; c < Ncolor; c++) { tmp.Source(QUDA_CONSTANT_SOURCE, 1, 0, c); - xpy(tmp, *B[c]); + xpy(tmp, B[c]); } } else { @@ -1717,14 +1684,15 @@ namespace quda // global orthonormalization of the generated null-space vectors if(param.mg_global.post_orthonormalize) { - for(int i=0; i<(int)B.size(); i++) { - double nrm2 = norm2(*B[i]); - if (nrm2 > 1e-16) ax(1.0 /sqrt(nrm2), *B[i]);// i/<i,i> + for (auto i = 0u; i < B.size(); i++) { + double nrm2 = norm2(B[i]); + if (nrm2 > 1e-16) + ax(1.0 / sqrt(nrm2), B[i]); // i/<i,i> else errorQuda("\nCannot normalize %u vector\n", i); } } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Done building free vectors\n"); + logQuda(QUDA_VERBOSE, "Done building free vectors\n"); popLevel(); } @@ -1739,7 +1707,7 @@ namespace quda bool normop = param.mg_global.eig_param[param.level]->use_norm_op; // Dummy array to keep the eigensolver happy. - ColorSpinorParam csParam(*param.B[0]); + ColorSpinorParam csParam(param.B[0]); csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.create = QUDA_ZERO_FIELD_CREATE; // This is the vector precision used by matResidual @@ -1750,8 +1718,8 @@ namespace quda for (auto &b : B_evecs) b = ColorSpinorField(csParam); // before entering the eigen solver, let's free the B vectors to save some memory - ColorSpinorParam bParam(*param.B[0]); - for (int i = 0; i < (int)param.B.size(); i++) delete param.B[i]; + ColorSpinorParam bParam(param.B[0]); + for (auto &b : param.B) b = ColorSpinorField(); EigenSolver *eig_solve; if (!normop && !dagger) { @@ -1781,9 +1749,10 @@ namespace quda } // now reallocate the B vectors and copy in the e-vectors - for (int i = 0; i < (int)param.B.size(); i++) { - param.B[i] = new ColorSpinorField(bParam); - *param.B[i] = B_evecs[i]; // FIXME can std::move this + bParam.create = QUDA_NULL_FIELD_CREATE; + for (auto i = 0u; i < param.B.size(); i++) { + param.B[i] = ColorSpinorField(bParam); + param.B[i] = B_evecs[i]; } // only save if outfile is defined diff --git a/lib/transfer.cpp b/lib/transfer.cpp index 895055219c..d67c4c8228 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -19,7 +19,7 @@ namespace quda { * for the staggered case, there is no spin blocking, * however we do even-odd to preserve chirality (that is straightforward) */ - Transfer::Transfer(const std::vector<ColorSpinorField *> &B, int Nvec, int n_block_ortho, bool block_ortho_two_pass, + Transfer::Transfer(const std::vector<ColorSpinorField> &B, int Nvec, int n_block_ortho, bool block_ortho_two_pass, int *geo_bs, int spin_bs, QudaPrecision null_precision, const QudaTransferType transfer_type) : B(B), Nvec(Nvec), @@ -28,26 +28,26 @@ namespace quda { null_precision(null_precision), spin_bs(spin_bs), spin_map(0), - nspin_fine(B[0]->Nspin()), + nspin_fine(B[0].Nspin()), site_subset(QUDA_FULL_SITE_SUBSET), parity(QUDA_INVALID_PARITY), use_gpu(true), transfer_type(transfer_type) { postTrace(); - int ndim = B[0]->Ndim(); + int ndim = B[0].Ndim(); // Only loop over four dimensions for now, we don't have // to worry about the fifth dimension until we hit chiral fermions.
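// (Worked example of the fallback loop below, under the checks as written: requesting
//  geo_bs[0] = 16 on a local extent X(0) = 16 trips the first warning, so the block is
//  halved to 8; then X(0)/geo_bs[0] = 2 is an even coarse extent ((2+1) % 2 != 0) and
//  2 * 8 == 16 divides exactly, so a block size of 8 is accepted.)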
for (int d = 0; d < 4; d++) { while (geo_bs[d] > 0) { - if (d==0 && B[0]->X(0) == geo_bs[0]) - warningQuda("X-dimension length %d cannot block length %d", B[0]->X(0), geo_bs[0]); - else if ( (B[0]->X(d)/geo_bs[d]+1)%2 == 0) - warningQuda("Indexing does not (yet) support odd coarse dimensions: X(%d) = %d", d, B[0]->X(d)/geo_bs[d]); - else if ( (B[0]->X(d)/geo_bs[d]) * geo_bs[d] != B[0]->X(d) ) - warningQuda("cannot block dim[%d]=%d with block size = %d", d, B[0]->X(d), geo_bs[d]); - else + if (d == 0 && B[0].X(0) == geo_bs[0]) + warningQuda("X-dimension length %d cannot block length %d", B[0].X(0), geo_bs[0]); + else if ((B[0].X(d) / geo_bs[d] + 1) % 2 == 0) + warningQuda("Indexing does not (yet) support odd coarse dimensions: X(%d) = %d", d, B[0].X(d) / geo_bs[d]); + else if ((B[0].X(d) / geo_bs[d]) * geo_bs[d] != B[0].X(d)) + warningQuda("cannot block dim[%d]=%d with block size = %d", d, B[0].X(d), geo_bs[d]); + else break; // this is a valid block size so let's use it geo_bs[d] /= 2; } @@ -71,11 +71,11 @@ namespace quda { errorQuda("Invalid total geometric block size %d for transfer type optimized-kd, must be 1", total_block_size); // The number of coarse dof is technically fineColor for optimized KD - if (Nvec != B[0]->Ncolor()) - errorQuda("Invalid Nvec %d for optimized-kd aggregation, must be fine color %d", Nvec, B[0]->Ncolor()); + if (Nvec != B[0].Ncolor()) + errorQuda("Invalid Nvec %d for optimized-kd aggregation, must be fine color %d", Nvec, B[0].Ncolor()); } else { - int aggregate_size = total_block_size * B[0]->Ncolor(); + int aggregate_size = total_block_size * B[0].Ncolor(); if (spin_bs == 0) aggregate_size /= 2; // effective spin_bs of 0.5 (fine spin / coarse spin) else @@ -95,23 +95,23 @@ namespace quda { if (Nvec != 24) errorQuda("Invalid number of coarse vectors %d for staggered KD multigrid, must be 24", Nvec); } - createV(B[0]->Location()); // allocate V field + createV(B[0].Location()); // allocate V field createTmp(QUDA_CPU_FIELD_LOCATION, 1); // allocate temporaries (needed for geomap creation) // allocate and compute the fine-to-coarse and coarse-to-fine site maps - fine_to_coarse_h = static_cast(pool_pinned_malloc(B[0]->Volume()*sizeof(int))); - coarse_to_fine_h = static_cast(pool_pinned_malloc(B[0]->Volume()*sizeof(int))); + fine_to_coarse_h = static_cast(pool_pinned_malloc(B[0].Volume() * sizeof(int))); + coarse_to_fine_h = static_cast(pool_pinned_malloc(B[0].Volume() * sizeof(int))); if (enable_gpu) { - fine_to_coarse_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); - coarse_to_fine_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); + fine_to_coarse_d = static_cast(pool_device_malloc(B[0].Volume() * sizeof(int))); + coarse_to_fine_d = static_cast(pool_device_malloc(B[0].Volume() * sizeof(int))); } createGeoMap(geo_bs); // allocate the fine-to-coarse spin map spin_map = static_cast(safe_malloc(nspin_fine*sizeof(int*))); - for (int s = 0; s < B[0]->Nspin(); s++) spin_map[s] = static_cast(safe_malloc(2*sizeof(int))); + for (int s = 0; s < B[0].Nspin(); s++) spin_map[s] = static_cast(safe_malloc(2 * sizeof(int))); createSpinMap(spin_bs); reset(); @@ -123,13 +123,13 @@ namespace quda { postTrace(); // create the storage for the final block orthogonal elements - ColorSpinorParam param(*B[0]); // takes the geometry from the null-space vectors + ColorSpinorParam param(B[0]); // takes the geometry from the null-space vectors // the ordering of the V vector is defined by these parameters and // the Packed functions in ColorSpinorFieldOrder - 
param.nSpin = B[0]->Nspin(); // spin has direct mapping - param.nColor = B[0]->Ncolor()*Nvec; // nColor = number of colors * number of vectors + param.nSpin = B[0].Nspin(); // spin has direct mapping + param.nColor = B[0].Ncolor() * Nvec; // nColor = number of colors * number of vectors param.nVec = Nvec; param.create = QUDA_NULL_FIELD_CREATE; // the V field is defined on all sites regardless of B field (maybe the B fields are always full?) @@ -141,7 +141,7 @@ namespace quda { param.location = location; param.fieldOrder = location == QUDA_CUDA_FIELD_LOCATION ? colorspinor::getNative(null_precision, param.nSpin) : QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - param.setPrecision(location == QUDA_CUDA_FIELD_LOCATION ? null_precision : B[0]->Precision()); + param.setPrecision(location == QUDA_CUDA_FIELD_LOCATION ? null_precision : B[0].Precision()); if (transfer_type == QUDA_TRANSFER_COARSE_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) { @@ -173,7 +173,7 @@ namespace quda { if (location == QUDA_CPU_FIELD_LOCATION && fine_tmp_h.size() == n_src && coarse_tmp_h.size() == n_src) return; postTrace(); - ColorSpinorParam param(*B[0]); + ColorSpinorParam param(B[0]); param.create = QUDA_NULL_FIELD_CREATE; param.location = location; param.fieldOrder = location == QUDA_CUDA_FIELD_LOCATION ? colorspinor::getNative(null_precision, param.nSpin) : @@ -197,7 +197,7 @@ namespace quda { if (!enable_cpu && !enable_gpu) errorQuda("Neither CPU or GPU coarse fields initialized"); // delayed allocating this temporary until we need it - //if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION, n_src); + // if (B[0].Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION, n_src); createTmp(location, n_src); switch (location) { @@ -205,10 +205,10 @@ namespace quda { if (enable_gpu) return; createV(location); if (transfer_type == QUDA_TRANSFER_AGGREGATE) *V_d = *V_h; - fine_to_coarse_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); - coarse_to_fine_d = static_cast(pool_device_malloc(B[0]->Volume()*sizeof(int))); - qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0]->Volume() * sizeof(int), qudaMemcpyHostToDevice); - qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0]->Volume() * sizeof(int), qudaMemcpyHostToDevice); + fine_to_coarse_d = static_cast(pool_device_malloc(B[0].Volume() * sizeof(int))); + coarse_to_fine_d = static_cast(pool_device_malloc(B[0].Volume() * sizeof(int))); + qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0].Volume() * sizeof(int), qudaMemcpyHostToDevice); + qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0].Volume() * sizeof(int), qudaMemcpyHostToDevice); break; case QUDA_CPU_FIELD_LOCATION: if (enable_cpu) return; @@ -230,7 +230,7 @@ namespace quda { } if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transfer: block orthogonalizing\n"); - if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) { + if (B[0].Location() == QUDA_CUDA_FIELD_LOCATION) { if (!enable_gpu) errorQuda("enable_gpu = %d so cannot reset", enable_gpu); BlockOrthogonalize(*V_d, B, fine_to_coarse_d, coarse_to_fine_d, geo_bs, spin_bs, NblockOrtho, blockOrthoTwoPass); if (enable_cpu) { @@ -312,14 +312,14 @@ namespace quda { // now create an inverse-like variant of this - std::vector geo_sort(B[0]->Volume()); + std::vector geo_sort(B[0].Volume()); for (unsigned int i=0; iVolume() * sizeof(int), qudaMemcpyHostToDevice); - qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0]->Volume() * sizeof(int), 
qudaMemcpyHostToDevice); + qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0].Volume() * sizeof(int), qudaMemcpyHostToDevice); + qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0].Volume() * sizeof(int), qudaMemcpyHostToDevice); } } @@ -333,7 +333,7 @@ namespace quda { } else { - for (int s=0; sNspin(); s++) { + for (int s = 0; s < B[0].Nspin(); s++) { spin_map[s][0] = s / spin_bs; // not staggered, doesn't care about parity. spin_map[s][1] = s / spin_bs; } From 12cf478e0e9b573032765593d86db8252b99774e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 Jan 2024 09:29:34 -0800 Subject: [PATCH 06/21] Move all ColorSpinorFields in multigrid.cpp to objects --- include/color_spinor_field.h | 38 ------- include/multigrid.h | 12 +- lib/color_spinor_field.cpp | 83 -------------- lib/multigrid.cpp | 206 +++++++++++++++++------------------ 4 files changed, 105 insertions(+), 234 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 923678dbaa..d91531b900 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -781,44 +781,6 @@ namespace quda */ ColorSpinorField create_alias(const ColorSpinorParam ¶m = ColorSpinorParam()); - /** - @brief Create a field that aliases this field's storage. The - alias field can use a different precision than this field, - though it cannot be greater. This functionality is useful for - the case where we have multiple temporaries in different - precisions, but do not need them simultaneously. Use this functionality with caution. - @param[in] param Parameters for the alias field - */ - ColorSpinorField *CreateAlias(const ColorSpinorParam ¶m); - - /** - @brief Create a coarse color-spinor field, using this field to set the meta data - @param[in] geoBlockSize Geometric block size that defines the coarse grid dimensions - @param[in] spinlockSize Geometric block size that defines the coarse spin dimension - @param[in] Nvec Number of coarse color degrees of freedom per grid point - @param[in] precision Optionally set the precision of the fine field - @param[in] location Optionally set the location of the coarse field - @param[in] mem_type Optionally set the memory type used (e.g., can override with mapped memory) - */ - ColorSpinorField *CreateCoarse(const int *geoBlockSize, int spinBlockSize, int Nvec, - QudaPrecision precision = QUDA_INVALID_PRECISION, - QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION, - QudaMemoryType mem_Type = QUDA_MEMORY_INVALID); - - /** - @brief Create a fine color-spinor field, using this field to set the meta data - @param[in] geoBlockSize Geometric block size that defines the fine grid dimensions - @param[in] spinlockSize Geometric block size that defines the fine spin dimension - @param[in] Nvec Number of fine color degrees of freedom per grid point - @param[in] precision Optionally set the precision of the fine field - @param[in] location Optionally set the location of the fine field - @param[in] mem_type Optionally set the memory type used (e.g., can override with mapped memory) - */ - ColorSpinorField *CreateFine(const int *geoblockSize, int spinBlockSize, int Nvec, - QudaPrecision precision = QUDA_INVALID_PRECISION, - QudaFieldLocation location = QUDA_INVALID_FIELD_LOCATION, - QudaMemoryType mem_type = QUDA_MEMORY_INVALID); - /** @brief Create a coarse color-spinor field, using this field to set the meta data @param[in] geoBlockSize Geometric block size that defines the coarse grid dimensions diff --git a/include/multigrid.h b/include/multigrid.h index 
16250e5bdc..a369789dfc 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -295,22 +295,22 @@ namespace quda { std::vector B_coarse; /** Residual vector */ - ColorSpinorField *r = nullptr; + ColorSpinorField r; /** Projected source vector for preconditioned system, else just points to source */ - ColorSpinorField *b_tilde = nullptr; + ColorSpinorField b_tilde; /** Coarse residual vector */ - ColorSpinorField *r_coarse = nullptr; + ColorSpinorField r_coarse; /** Coarse solution vector */ - ColorSpinorField *x_coarse = nullptr; + ColorSpinorField x_coarse; /** Coarse temporary vector */ - ColorSpinorField *tmp_coarse = nullptr; + ColorSpinorField tmp_coarse; /** Sloppy coarse temporary vector */ - ColorSpinorField *tmp_coarse_sloppy = nullptr; + ColorSpinorField tmp_coarse_sloppy; /** Kahler-Dirac Xinv */ std::shared_ptr xInvKD; diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index e8fbfd56d5..9c6c5d0272 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -840,89 +840,6 @@ namespace quda return ColorSpinorField(param); } - ColorSpinorField *ColorSpinorField::CreateAlias(const ColorSpinorParam ¶m_) - { - if (param_.Precision() > precision) - errorQuda("Cannot create an alias to source with lower precision than the alias"); - ColorSpinorParam param(param_); - param.create = QUDA_REFERENCE_FIELD_CREATE; - param.v = data(); - - return new ColorSpinorField(param); - } - - ColorSpinorField *ColorSpinorField::CreateCoarse(const int *geoBlockSize, int spinBlockSize, int Nvec, - QudaPrecision new_precision, QudaFieldLocation new_location, - QudaMemoryType new_mem_type) - { - ColorSpinorParam coarseParam(*this); - for (int d = 0; d < nDim; d++) coarseParam.x[d] = x[d] / geoBlockSize[d]; - - int geoBlockVolume = 1; - for (int d = 0; d < nDim; d++) { geoBlockVolume *= geoBlockSize[d]; } - - // Detect if the "coarse" op is the Kahler-Dirac op or something else - // that still acts on a fine staggered ColorSpinorField - if (geoBlockVolume == 1 && Nvec == nColor && nSpin == 1) { - coarseParam.nSpin = nSpin; - coarseParam.nColor = nColor; - } else { - coarseParam.nSpin = (nSpin == 1) ? 2 : (nSpin / spinBlockSize); // coarsening staggered check - coarseParam.nColor = Nvec; - } - - coarseParam.siteSubset = QUDA_FULL_SITE_SUBSET; // coarse grid is always full - coarseParam.create = QUDA_ZERO_FIELD_CREATE; - - // if new precision is not set, use this->precision - new_precision = (new_precision == QUDA_INVALID_PRECISION) ? Precision() : new_precision; - - // if new location is not set, use this->location - new_location = (new_location == QUDA_INVALID_FIELD_LOCATION) ? Location() : new_location; - - coarseParam.fieldOrder = (new_location == QUDA_CUDA_FIELD_LOCATION) ? 
- colorspinor::getNative(new_precision, coarseParam.nSpin) : - QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - - coarseParam.setPrecision(new_precision); - - // set where we allocate the field - coarseParam.mem_type = new_mem_type; - - return new ColorSpinorField(coarseParam); - } - - ColorSpinorField *ColorSpinorField::CreateFine(const int *geoBlockSize, int spinBlockSize, int Nvec, - QudaPrecision new_precision, QudaFieldLocation new_location, - QudaMemoryType new_mem_type) - { - ColorSpinorParam fineParam(*this); - for (int d = 0; d < nDim; d++) fineParam.x[d] = x[d] * geoBlockSize[d]; - fineParam.nSpin = nSpin * spinBlockSize; - fineParam.nColor = Nvec; - fineParam.siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME fine grid is always full - fineParam.create = QUDA_ZERO_FIELD_CREATE; - - // if new precision is not set, use this->precision - new_precision = (new_precision == QUDA_INVALID_PRECISION) ? Precision() : new_precision; - - // if new location is not set, use this->location - new_location = (new_location == QUDA_INVALID_FIELD_LOCATION) ? Location() : new_location; - - // for GPU fields, always use native ordering to ensure coalescing - if (new_location == QUDA_CUDA_FIELD_LOCATION) { - fineParam.setPrecision(new_precision, new_precision, true); - } else { - fineParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - fineParam.setPrecision(new_precision); - } - - // set where we allocate the field - fineParam.mem_type = new_mem_type; - - return new ColorSpinorField(fineParam); - } - ColorSpinorField ColorSpinorField::create_coarse(const int *geoBlockSize, int spinBlockSize, int Nvec, QudaPrecision new_precision, QudaFieldLocation new_location, QudaMemoryType new_mem_type) diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 03247e2356..fac28198d3 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -46,13 +46,13 @@ namespace quda } if (param.B[0].Nspin() == 1) csParam.gammaBasis = param.B[0].GammaBasis(); // hack for staggered to avoid unnecessary basis checks - r = new ColorSpinorField(csParam); + r = ColorSpinorField(csParam); // if we're using preconditioning then allocate storage for the preconditioned source vector if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) { csParam.x[0] /= 2; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; - b_tilde = new ColorSpinorField(csParam); + b_tilde = ColorSpinorField(csParam); } } @@ -65,8 +65,8 @@ namespace quda // Initializing to random vectors for (int i = 0; i < (int)param.B.size(); i++) { - spinorNoise(*r, *rng, QUDA_NOISE_UNIFORM); - param.B[i] = *r; + spinorNoise(r, *rng, QUDA_NOISE_UNIFORM); + param.B[i] = r; } } if (param.mg_global.num_setup_iter[param.level] > 0) { @@ -139,19 +139,19 @@ namespace quda param.mg_global.geo_block_size[param.level][i] = param.geoBlockSize[i]; // create coarse temporary vector if not already created in verify() - if (!tmp_coarse) - tmp_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (tmp_coarse.empty()) + tmp_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); // create coarse residual vector if not already created in verify() - if (!r_coarse) - r_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (r_coarse.empty()) + r_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, 
r.Precision(), + param.mg_global.location[param.level + 1]); // create coarse solution vector if not already created in verify() - if (!x_coarse) - x_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (x_coarse.empty()) + x_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); int nVec_coarse = std::max(param.Nvec, param.mg_global.n_vec[param.level + 1]); B_coarse.resize(nVec_coarse); @@ -504,10 +504,10 @@ namespace quda xInvKD_sloppy = std::shared_ptr(reinterpret_cast(new GaugeField(xinv_param))); xInvKD_sloppy->copy(*xInvKD); - ColorSpinorParam sloppy_tmp_param(*tmp_coarse); + ColorSpinorParam sloppy_tmp_param(tmp_coarse); sloppy_tmp_param.setPrecision(param.mg_global.invert_param->cuda_prec_precondition); - tmp_coarse_sloppy = new ColorSpinorField(sloppy_tmp_param); + tmp_coarse_sloppy = ColorSpinorField(sloppy_tmp_param); } else { // We can just alias fields @@ -677,7 +677,7 @@ namespace quda param_coarse_solver->verbosity_precondition = param.mg_global.verbosity[param.level+1]; // preconditioned solver wrapper is uniform precision - param_coarse_solver->precision = r_coarse->Precision(); + param_coarse_solver->precision = r_coarse.Precision(); param_coarse_solver->precision_sloppy = param_coarse_solver->precision; param_coarse_solver->precision_precondition = param_coarse_solver->precision_sloppy; @@ -716,9 +716,9 @@ namespace quda // Run a dummy solve so that the deflation space is constructed and computed if needed during the MG setup, // or the eigenvalues are recomputed during transfer. - spinorNoise(*r_coarse, *coarse->rng, QUDA_NOISE_UNIFORM); + spinorNoise(r_coarse, *coarse->rng, QUDA_NOISE_UNIFORM); param_coarse_solver->maxiter = 1; // do a single iteration on the dummy solve - (*coarse_solver)(*x_coarse, *r_coarse); + (*coarse_solver)(x_coarse, r_coarse); setOutputPrefix(prefix); // restore since we just popped back from coarse grid param_coarse_solver->maxiter = param.mg_global.coarse_solver_maxiter[param.level + 1]; } @@ -760,14 +760,6 @@ namespace quda if (presmoother) delete presmoother; if (param_presmooth) delete param_presmooth; - - if (b_tilde && param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) delete b_tilde; - if (r) delete r; - if (r_coarse) delete r_coarse; - if (x_coarse) delete x_coarse; - if (tmp_coarse) delete tmp_coarse; - if (tmp_coarse_sloppy) delete tmp_coarse_sloppy; - if (param_coarse) delete param_coarse; popLevel(); @@ -786,7 +778,7 @@ namespace quda pushLevel(param.level); // temporary fields used for verification - ColorSpinorParam csParam(*r); + ColorSpinorParam csParam(r); csParam.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField tmp1(csParam); ColorSpinorField tmp2(csParam); @@ -815,14 +807,14 @@ namespace quda // as well as copying to the correct location this also changes basis if necessary tmp1 = param.B[i]; - transfer->R(*r_coarse, tmp1); - transfer->P(tmp2, *r_coarse); + transfer->R(r_coarse, tmp1); + transfer->P(tmp2, r_coarse); auto max_deviation = blas::max_deviation(tmp2, tmp1); auto l2_deviation = sqrt(xmyNorm(tmp1, tmp2) / norm2(tmp1)); logQuda( QUDA_VERBOSE, "Vector %d: L2 norms v_k = %e P^\\dagger v_k = %e (1 - P P^\\dagger) v_k = %e; Deviations: L2 relative = %e, max = %e\n", - i, norm2(tmp1), norm2(*r_coarse), norm2(tmp2), l2_deviation, max_deviation[0]); + i, norm2(tmp1), norm2(r_coarse), norm2(tmp2), l2_deviation, max_deviation[0]); if 
(check_deviation(l2_deviation, tol)) errorQuda("k=%d orthonormality failed: L2 relative deviation %e > %e", i, l2_deviation, tol); if (check_deviation(max_deviation[0], tol)) @@ -839,10 +831,10 @@ namespace quda logQuda(QUDA_SUMMARIZE, "Checking 1 > || (1 - DP(P^dagDP)P^dag) v_k || / || v_k || for %d vectors\n", param.Nvec); for (int i = 0; i < param.Nvec; i++) { - transfer->R(*r_coarse, param.B[i]); - (*coarse_solver)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass + transfer->R(r_coarse, param.B[i]); + (*coarse_solver)(x_coarse, r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore prefix after return from coarse grid - transfer->P(tmp2, *x_coarse); + transfer->P(tmp2, x_coarse); (*param.matResidual)(tmp1, tmp2); tmp2 = param.B[i]; logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e\n", i, norm2(param.B[i]), norm2(tmp1)); @@ -859,10 +851,10 @@ namespace quda param.Nvec); for (int i=0; iR(*r_coarse, param.B[i]); - (*coarse)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass + transfer->R(r_coarse, param.B[i]); + (*coarse)(x_coarse, r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore output prefix - transfer->P(tmp2, *x_coarse); + transfer->P(tmp2, x_coarse); param.matResidual(tmp1, tmp2); tmp2 = param.B[i]; logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e ", i, norm2(param.B[i]), norm2(tmp1)); @@ -873,30 +865,30 @@ namespace quda // We need to create temporary coarse vectors here for various verifications // Otherwise these get created in the coarse `reset()` routine later - if (!tmp_coarse) - tmp_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (tmp_coarse.empty()) + tmp_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); // create coarse residual vector if not already created in verify() - if (!r_coarse) - r_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (r_coarse.empty()) + r_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); // create coarse solution vector if not already created in verify() - if (!x_coarse) - x_coarse = param.B[0].CreateCoarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r->Precision(), - param.mg_global.location[param.level + 1]); + if (x_coarse.empty()) + x_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(), + param.mg_global.location[param.level + 1]); { logQuda(QUDA_SUMMARIZE, "Checking 0 = (1 - P^\\dagger P) eta_c\n"); - spinorNoise(*x_coarse, *rng, QUDA_NOISE_UNIFORM); - transfer->P(tmp2, *x_coarse); - transfer->R(*r_coarse, tmp2); - auto r2 = norm2(*r_coarse); - auto max_deviation = blas::max_deviation(*r_coarse, *x_coarse); - auto l2_deviation = sqrt(xmyNorm(*x_coarse, *r_coarse) / norm2(*x_coarse)); - logQuda(QUDA_VERBOSE, "L2 norms %e %e (fine tmp %e); Deviations: L2 relative = %e, max = %e\n", norm2(*x_coarse), + spinorNoise(x_coarse, *rng, QUDA_NOISE_UNIFORM); + transfer->P(tmp2, x_coarse); + transfer->R(r_coarse, tmp2); + auto r2 = norm2(r_coarse); + auto max_deviation = blas::max_deviation(r_coarse, x_coarse); + auto l2_deviation = sqrt(xmyNorm(x_coarse, r_coarse) / norm2(x_coarse)); + 
logQuda(QUDA_VERBOSE, "L2 norms %e %e (fine tmp %e); Deviations: L2 relative = %e, max = %e\n", norm2(x_coarse), r2, norm2(tmp2), l2_deviation, max_deviation[0]); if (check_deviation(l2_deviation, tol)) errorQuda("coarse span failed: L2 relative deviation = %e > %e", l2_deviation, tol); @@ -905,17 +897,17 @@ namespace quda } logQuda(QUDA_SUMMARIZE, "Checking 0 = (D_c - P^\\dagger D P) (native coarse operator to emulated operator)\n"); - zero(*tmp_coarse); - zero(*r_coarse); + zero(tmp_coarse); + zero(r_coarse); #if 0 // debugging trick: point source matrix elements - tmp_coarse->Source(QUDA_POINT_SOURCE, 0, 0, 0); + tmp_coarse.Source(QUDA_POINT_SOURCE, 0, 0, 0); #else - spinorNoise(*tmp_coarse, *rng, QUDA_NOISE_UNIFORM); + spinorNoise(tmp_coarse, *rng, QUDA_NOISE_UNIFORM); #endif // put a non-trivial vector on the fine level as well - transfer->P(tmp1, *tmp_coarse); + transfer->P(tmp1, tmp_coarse); // the three-hop terms in ASQTAD can break the verification depending on how we're coarsening the operator // and if the aggregate size is too small in a direction @@ -975,8 +967,8 @@ namespace quda (*param.matResidual)(tmp2, tmp1); } - transfer->R(*x_coarse, tmp2); - static_cast(diracCoarseResidual)->M(*r_coarse, *tmp_coarse); + transfer->R(x_coarse, tmp2); + static_cast(diracCoarseResidual)->M(r_coarse, tmp_coarse); #if 0 // enable to print out emulated and actual coarse-grid operator vectors for debugging setOutputPrefix(""); @@ -986,20 +978,20 @@ namespace quda printfQuda("\nemulated\n"); comm_barrier(); for (int parity = 0; parity < 2; parity++) - for (unsigned int x_cb = 0; x_cb < x_coarse->VolumeCB(); x_cb++) x_coarse->PrintVector(parity, x_cb, rank); + for (unsigned int x_cb = 0; x_cb < x_coarse.VolumeCB(); x_cb++) x_coarse.PrintVector(parity, x_cb, rank); comm_barrier(); printfQuda("\nactual\n"); comm_barrier(); for (int parity = 0; parity < 2; parity++) - for (unsigned int x_cb = 0; x_cb < r_coarse->VolumeCB(); x_cb++) r_coarse->PrintVector(parity, x_cb, rank); + for (unsigned int x_cb = 0; x_cb < r_coarse.VolumeCB(); x_cb++) r_coarse.PrintVector(parity, x_cb, rank); } setOutputPrefix(prefix); #endif - double r_nrm = norm2(*r_coarse); - auto max_deviation = blas::max_deviation(*r_coarse, *x_coarse); - auto l2_deviation = sqrt(xmyNorm(*x_coarse, *r_coarse) / norm2(*x_coarse)); + double r_nrm = norm2(r_coarse); + auto max_deviation = blas::max_deviation(r_coarse, x_coarse); + auto l2_deviation = sqrt(xmyNorm(x_coarse, r_coarse) / norm2(x_coarse)); if (diracResidual->Mu() != 0.0) { // When the mu is shifted on the coarse level; we can compute exactly the error we introduce in the check: @@ -1008,13 +1000,13 @@ namespace quda if (fabs(delta_factor) > tol) { double delta_a = delta_factor * 2.0 * diracResidual->Kappa() * diracResidual->Mu() * transfer->Vectors().TwistFlavor(); - l2_deviation -= fabs(delta_a) * sqrt(norm2(*tmp_coarse) / norm2(*x_coarse)); + l2_deviation -= fabs(delta_a) * sqrt(norm2(tmp_coarse) / norm2(x_coarse)); l2_deviation = fabs(l2_deviation); max_deviation[0] -= fabs(delta_a); } } logQuda(QUDA_VERBOSE, "L2 norms: Emulated = %e, Native = %e; Deviations: L2 relative = %e, max = %e\n", - norm2(*x_coarse), r_nrm, l2_deviation, max_deviation[0]); + norm2(x_coarse), r_nrm, l2_deviation, max_deviation[0]); if (check_deviation(l2_deviation, tol)) errorQuda("Coarse operator failed: L2 relative deviation = %e > %e", l2_deviation, tol); @@ -1028,14 +1020,14 @@ namespace quda if (coarse_was_preconditioned) { // check eo logQuda(QUDA_SUMMARIZE, "Checking Deo of preconditioned 
operator 0 = \\hat{D}_c - A^{-1} D_c\n"); - static_cast(diracCoarseResidual)->Dslash(r_coarse->Even(), tmp_coarse->Odd(), QUDA_EVEN_PARITY); - static_cast(diracCoarseResidual)->CloverInv(x_coarse->Even(), r_coarse->Even(), QUDA_EVEN_PARITY); - static_cast(diracCoarseSmoother)->Dslash(r_coarse->Even(), tmp_coarse->Odd(), QUDA_EVEN_PARITY); - double r_nrm = norm2(r_coarse->Even()); - auto max_deviation = blas::max_deviation(r_coarse->Even(), x_coarse->Even()); - auto l2_deviation = sqrt(xmyNorm(x_coarse->Even(), r_coarse->Even()) / norm2(x_coarse->Even())); + static_cast(diracCoarseResidual)->Dslash(r_coarse.Even(), tmp_coarse.Odd(), QUDA_EVEN_PARITY); + static_cast(diracCoarseResidual)->CloverInv(x_coarse.Even(), r_coarse.Even(), QUDA_EVEN_PARITY); + static_cast(diracCoarseSmoother)->Dslash(r_coarse.Even(), tmp_coarse.Odd(), QUDA_EVEN_PARITY); + double r_nrm = norm2(r_coarse.Even()); + auto max_deviation = blas::max_deviation(r_coarse.Even(), x_coarse.Even()); + auto l2_deviation = sqrt(xmyNorm(x_coarse.Even(), r_coarse.Even()) / norm2(x_coarse.Even())); logQuda(QUDA_VERBOSE, "L2 norms: Emulated = %e, Native = %e; Deviations: L2 relative = %e, max = %e\n", - norm2(x_coarse->Even()), r_nrm, l2_deviation, max_deviation[0]); + norm2(x_coarse.Even()), r_nrm, l2_deviation, max_deviation[0]); if (check_deviation(l2_deviation, tol)) errorQuda("Preconditioned Deo failed: L2 relative deviation = %e > %e", l2_deviation, tol); if (check_deviation(max_deviation[0], tol)) @@ -1043,14 +1035,14 @@ namespace quda // check Doe logQuda(QUDA_SUMMARIZE, "Checking Doe of preconditioned operator 0 = \\hat{D}_c - A^{-1} D_c\n"); - static_cast(diracCoarseResidual)->Dslash(r_coarse->Odd(), tmp_coarse->Even(), QUDA_ODD_PARITY); - static_cast(diracCoarseResidual)->CloverInv(x_coarse->Odd(), r_coarse->Odd(), QUDA_ODD_PARITY); - static_cast(diracCoarseSmoother)->Dslash(r_coarse->Odd(), tmp_coarse->Even(), QUDA_ODD_PARITY); - r_nrm = norm2(r_coarse->Odd()); - max_deviation = blas::max_deviation(r_coarse->Odd(), x_coarse->Odd()); - l2_deviation = sqrt(xmyNorm(x_coarse->Odd(), r_coarse->Odd()) / norm2(x_coarse->Odd())); + static_cast(diracCoarseResidual)->Dslash(r_coarse.Odd(), tmp_coarse.Even(), QUDA_ODD_PARITY); + static_cast(diracCoarseResidual)->CloverInv(x_coarse.Odd(), r_coarse.Odd(), QUDA_ODD_PARITY); + static_cast(diracCoarseSmoother)->Dslash(r_coarse.Odd(), tmp_coarse.Even(), QUDA_ODD_PARITY); + r_nrm = norm2(r_coarse.Odd()); + max_deviation = blas::max_deviation(r_coarse.Odd(), x_coarse.Odd()); + l2_deviation = sqrt(xmyNorm(x_coarse.Odd(), r_coarse.Odd()) / norm2(x_coarse.Odd())); logQuda(QUDA_VERBOSE, "L2 norms: Emulated = %e, Native = %e; Deviations: L2 relative = %e, max = %e\n", - norm2(x_coarse->Odd()), r_nrm, l2_deviation, max_deviation[0]); + norm2(x_coarse.Odd()), r_nrm, l2_deviation, max_deviation[0]); if (check_deviation(l2_deviation, tol)) errorQuda("Preconditioned Doe failed: L2 relative deviation = %e > %e", l2_deviation, tol); if (check_deviation(max_deviation[0], tol)) @@ -1107,12 +1099,12 @@ namespace quda for (int i = 0; i < param.Nvec; i++) { // Restrict Evec, place result in r_coarse - transfer->R(*r_coarse, param.B[i]); + transfer->R(r_coarse, param.B[i]); // Prolong r_coarse, place result in tmp2 - transfer->P(tmp2, *r_coarse); + transfer->P(tmp2, r_coarse); printfQuda("Vector %d: norms v_k = %e P^dag v_k = %e PP^dag v_k = %e\n", i, norm2(param.B[i]), - norm2(*r_coarse), norm2(tmp2)); + norm2(r_coarse), norm2(tmp2)); // Compare v_k and PP^dag v_k. 
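// The identity behind this check: block orthonormalization gives P^\dag P = 1,
// so P P^\dag is a projector onto the span of the null-space vectors, and each
// v_k should be reproduced up to precision:
//   r_coarse = P^\dag v_k   (the transfer->R call above)
//   tmp2     = P P^\dag v_k (the transfer->P call above)
//   || v_k - P P^\dag v_k || / || v_k || ~ 0,
// which is what the max_deviation check below measures.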
auto max_deviation = blas::max_deviation(tmp2, param.B[i]); @@ -1128,10 +1120,10 @@ namespace quda // Oblique projections logQuda(QUDA_SUMMARIZE, "Checking 1 > || (1 - DP(P^dagDP)P^dag) v_k || / || v_k || for vector %d\n", i); - transfer->R(*r_coarse, param.B[i]); - (*coarse_solver)(*x_coarse, *r_coarse); // this needs to be an exact solve to pass + transfer->R(r_coarse, param.B[i]); + (*coarse_solver)(x_coarse, r_coarse); // this needs to be an exact solve to pass setOutputPrefix(prefix); // restore prefix after return from coarse grid - transfer->P(tmp2, *x_coarse); + transfer->P(tmp2, x_coarse); (*param.matResidual)(tmp1, tmp2); logQuda(QUDA_SUMMARIZE, "Vector %d: norms v_k %e DP(P^dagDP)P^dag v_k %e\n", i, norm2(param.B[i]), @@ -1194,8 +1186,7 @@ namespace quda diracSmoother->prepare(in, out, x, b, outer_solution_type); // b_tilde holds either a copy of preconditioned source or a pointer to original source - if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) *b_tilde = *in; - else b_tilde = &b; + if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) b_tilde = *in; if (presmoother) (*presmoother)(*out, *in); else zero(*out); @@ -1214,8 +1205,8 @@ namespace quda // FIXME this is currently borked if inner solver is preconditioned ColorSpinorField &residual = !presmoother ? b : use_solver_residual ? presmoother->get_residual() : - b.SiteSubset() == QUDA_FULL_SITE_SUBSET ? *r : - r->Even(); + b.SiteSubset() == QUDA_FULL_SITE_SUBSET ? r : + r.Even(); if (!use_solver_residual && presmoother) { (*param.matResidual)(residual, x); @@ -1228,32 +1219,33 @@ namespace quda if (transfer) { // restrict to the coarse grid - transfer->R(*r_coarse, residual); - if ( debug ) printfQuda("after pre-smoothing x2 = %e, r2 = %e, r_coarse2 = %e\n", norm2(x), r2, norm2(*r_coarse)); + transfer->R(r_coarse, residual); + if (debug) printfQuda("after pre-smoothing x2 = %e, r2 = %e, r_coarse2 = %e\n", norm2(x), r2, norm2(r_coarse)); // recurse to the next lower level - (*coarse_solver)(*x_coarse, *r_coarse); - if (debug) printfQuda("after coarse solve x_coarse2 = %e r_coarse2 = %e\n", norm2(*x_coarse), norm2(*r_coarse)); + (*coarse_solver)(x_coarse, r_coarse); + if (debug) printfQuda("after coarse solve x_coarse2 = %e r_coarse2 = %e\n", norm2(x_coarse), norm2(r_coarse)); // prolongate back to this grid - ColorSpinorField &x_coarse_2_fine = inner_solution_type == QUDA_MAT_SOLUTION ? *r : r->Even(); // define according to inner solution type - transfer->P(x_coarse_2_fine, *x_coarse); // repurpose residual storage + ColorSpinorField &x_coarse_2_fine + = inner_solution_type == QUDA_MAT_SOLUTION ? r : r.Even(); // define according to inner solution type + transfer->P(x_coarse_2_fine, x_coarse); // repurpose residual storage xpy(x_coarse_2_fine, solution); // sum to solution FIXME - sum should be done inside the transfer operator if ( debug ) { - printfQuda("Prolongated coarse solution y2 = %e\n", norm2(*r)); - printfQuda("after coarse-grid correction x2 = %e, r2 = %e\n", norm2(x), norm2(*r)); + printfQuda("Prolongated coarse solution y2 = %e\n", norm2(r)); + printfQuda("after coarse-grid correction x2 = %e, r2 = %e\n", norm2(x), norm2(r)); } } if (debug) printfQuda("preparing to post smooth\n"); // do the post smoothing - //residual = outer_solution_type == QUDA_MAT_SOLUTION ? *r : r->Even(); // refine for outer solution type + // residual = outer_solution_type == QUDA_MAT_SOLUTION ? 
r : r.Even(); // refine for outer solution type if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) { - in = b_tilde; + in = &b_tilde; } else { // this incurs unecessary copying - *r = b; - in = r; + r = b; + in = &r; } // we should keep a copy of the prepared right hand side as we've already destroyed it @@ -1276,9 +1268,9 @@ namespace quda } // FIXME on subset check - if (debug && b.SiteSubset() == r->SiteSubset()) { - (*param.matResidual)(*r, x); - double r2 = xmyNorm(b, *r); + if (debug && b.SiteSubset() == r.SiteSubset()) { + (*param.matResidual)(r, x); + double r2 = xmyNorm(b, r); printfQuda("leaving V-cycle with x2=%e, r2=%e\n", norm2(x), r2); } @@ -1359,7 +1351,7 @@ namespace quda } solverParam.pipeline = (solverParam.inv_type == QUDA_BICGSTAB_INVERTER ? 0 : 4); // FIXME: pipeline != 0 breaks BICGSTAB - solverParam.precision = r->Precision(); + solverParam.precision = r.Precision(); if (is_fine_grid()) { solverParam.precision_sloppy = param.mg_global.invert_param->cuda_prec_precondition; @@ -1371,7 +1363,7 @@ namespace quda solverParam.residual_type = static_cast(QUDA_L2_RELATIVE_RESIDUAL); solverParam.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; ColorSpinorParam csParam(B[0]); // Create spinor field parameters: - csParam.setPrecision(r->Precision(), r->Precision(), true); // ensure native ordering + csParam.setPrecision(r.Precision(), r.Precision(), true); // ensure native ordering csParam.location = QUDA_CUDA_FIELD_LOCATION; // hard code to GPU location for null-space generation for now csParam.gammaBasis = B[0].Nspin() == 1 ? QUDA_DEGRAND_ROSSI_GAMMA_BASIS : QUDA_UKQCD_GAMMA_BASIS; // degrand-rossi required for staggered From af341fe40caddadae5b5f0e7a914bbc35611d92c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 Jan 2024 12:15:10 -0800 Subject: [PATCH 07/21] Fix for block ortho: prior commit accidentally introduced memory copies of the null space fields --- include/kernels/block_orthogonalize.cuh | 9 ++++----- lib/block_orthogonalize.in.cu | 18 ++++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh index bec77273b7..348d56dcd8 100644 --- a/include/kernels/block_orthogonalize.cuh +++ b/include/kernels/block_orthogonalize.cuh @@ -51,15 +51,14 @@ namespace quda { static constexpr bool swizzle = false; int_fastdiv swizzle_factor; // for transposing blockIdx.x mapping to coarse grid coordinate - const Vector B[nVec]; + Vector B[nVec]; static constexpr bool launch_bounds = true; dim3 grid_dim; dim3 block_dim; - template - BlockOrthoArg(ColorSpinorField &V, const int *fine_to_coarse, const int *coarse_to_fine, int parity, - const int *geo_bs, const int n_block_ortho, const ColorSpinorField &meta, T... B) : + BlockOrthoArg(ColorSpinorField &V, const std::vector &B, const int *fine_to_coarse, const int *coarse_to_fine, int parity, + const int *geo_bs, const int n_block_ortho, const ColorSpinorField &meta) : kernel_param(dim3(meta.VolumeCB() * (fineSpin > 1 ? 
meta.SiteSubset() : 1), 1, chiral_blocks)), V(V), fine_to_coarse(fine_to_coarse), @@ -69,10 +68,10 @@ namespace quda { nParity(meta.SiteSubset()), nBlockOrtho(n_block_ortho), fineVolumeCB(meta.VolumeCB()), - B{B...}, grid_dim(), block_dim() { + for (int i = 0; i < nVec; i++) this->B[i] = B[i]; int aggregate_size = 1; for (int d = 0; d < V.Ndim(); d++) aggregate_size *= geo_bs[d]; aggregate_size_cb = aggregate_size / 2; diff --git a/lib/block_orthogonalize.in.cu b/lib/block_orthogonalize.in.cu index e4e3a9e12c..e67e083b94 100644 --- a/lib/block_orthogonalize.in.cu +++ b/lib/block_orthogonalize.in.cu @@ -112,20 +112,18 @@ namespace quda { } } - template - void launch_host_(const TuneParam &tp, const qudaStream_t &stream, const std::vector &B, - std::index_sequence) + template + void launch_host_(const TuneParam &tp, const qudaStream_t &stream) { - Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...); + Arg arg(V, B, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V); launch_host(tp, stream, arg); if (two_pass && iter == 0 && V.Precision() < QUDA_SINGLE_PRECISION && !activeTuning()) max = Rotator(V).abs_max(V); } - template - void launch_device_(const TuneParam &tp, const qudaStream_t &stream, const std::vector &B, - std::index_sequence) + template + void launch_device_(const TuneParam &tp, const qudaStream_t &stream) { - Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...); + Arg arg(V, B, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V); arg.swizzle_factor = tp.aux.x; launch_device(tp, stream, arg); if (two_pass && iter == 0 && V.Precision() < QUDA_SINGLE_PRECISION && !activeTuning()) max = Rotator(V).abs_max(V); @@ -140,7 +138,7 @@ namespace quda { && B[0].FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) { typedef FieldOrderCB Rotator; typedef FieldOrderCB Vector; - launch_host_(tp, stream, B, std::make_index_sequence()); + launch_host_(tp, stream); } else { errorQuda("Unsupported field order %d", V.FieldOrder()); } @@ -150,7 +148,7 @@ namespace quda { if (V.FieldOrder() == vOrder && B[0].FieldOrder() == bOrder) { typedef FieldOrderCB Rotator; typedef FieldOrderCB::value> Vector; - launch_device_(tp, stream, B, std::make_index_sequence()); + launch_device_(tp, stream); } else { errorQuda("Unsupported field order V=%d B=%d", V.FieldOrder(), B[0].FieldOrder()); } From 6b8ddcf5c2d0d6c1c6a77e9cb48f732e5cf90b5e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 Jan 2024 12:16:02 -0800 Subject: [PATCH 08/21] Add n_rhs to restrictor/prolongator aux string --- lib/prolongator.in.cu | 4 ++++ lib/restrictor.in.cu | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/prolongator.in.cu b/lib/prolongator.in.cu index 8320ca5e8f..d9e16b5f81 100644 --- a/lib/prolongator.in.cu +++ b/lib/prolongator.in.cu @@ -33,6 +33,10 @@ namespace quda { strcat(vol, out[0].VolString().c_str()); strcat(aux, ","); strcat(aux, out[0].AuxString().c_str()); + strcat(aux, ",n_rhs="); + char rhs_str[16]; + i32toa(rhs_str, out.size()); + strcat(aux, rhs_str); apply(device::get_default_stream()); } diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu index 27a1bd5c5d..281d5fb0bf 100644 --- a/lib/restrictor.in.cu +++ b/lib/restrictor.in.cu @@ -38,6 +38,10 @@ namespace quda { strcat(vol, out[0].VolString().c_str()); strcat(aux, ","); strcat(aux, out[0].AuxString().c_str()); + strcat(aux, ",n_rhs="); + char rhs_str[16]; + i32toa(rhs_str, 
out.size()); + strcat(aux, rhs_str); apply(device::get_default_stream()); } @@ -99,7 +103,7 @@ namespace quda { param.aux.x = 2; // swizzle factor } - long long flops() const { return out.size() * 8 * fineSpin * fineColor * coarseColor * in[0].SiteSubset()*in[0].VolumeCB(); } + long long flops() const { return out.size() * 8 * fineSpin * fineColor * coarseColor * in[0].SiteSubset() * in[0].VolumeCB(); } long long bytes() const { From 5351201b12d911f9dcfb31aafb796108ee0fdba7 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 Jan 2024 06:12:17 -0800 Subject: [PATCH 09/21] Prolongator and restrictor can now do gamma basis on the fly for fine-grid fields with nSpin=4 --- include/color_spinor_field_order.h | 2 +- include/kernels/prolongator.cuh | 24 ++++++++++++++++++---- include/kernels/restrictor.cuh | 33 +++++++++++++++--------------- lib/prolongator.in.cpp | 4 ++++ lib/prolongator.in.cu | 14 +++++++++++-- lib/restrictor.in.cpp | 4 ++++ lib/restrictor.in.cu | 24 +++++++++++++++++----- lib/transfer.cpp | 16 +++------------ 8 files changed, 79 insertions(+), 42 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 022d2a2b69..1f156d73d1 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -914,7 +914,7 @@ namespace quda */ template __device__ __host__ inline void load(complex out[nSpinBlock * nColor * nVec], int parity, int x_cb, - int chi) const + int chi = 0) const { if (!fixed) { accessor.template load((complex *)out, v.v, parity, x_cb, chi, volumeCB); diff --git a/include/kernels/prolongator.cuh b/include/kernels/prolongator.cuh index d4296d41b1..515c409e2c 100644 --- a/include/kernels/prolongator.cuh +++ b/include/kernels/prolongator.cuh @@ -1,4 +1,5 @@ #include +#include #include #include @@ -14,13 +15,14 @@ namespace quda { /** Kernel argument struct */ - template + template struct ProlongateArg : kernel_param<> { using real = Float; static constexpr int fineSpin = fineSpin_; static constexpr int coarseSpin = coarseSpin_; static constexpr int fineColor = fineColor_; static constexpr int coarseColor = coarseColor_; + static constexpr bool to_non_rel = to_non_rel_; // disable ghost to reduce arg size using F = FieldOrderCB(fineSpin), Float, Float, true>; @@ -36,7 +38,7 @@ namespace quda { const spin_mapper spin_map; const int parity; // the parity of the output field (if single parity) const int nParity; // number of parities of input fine field - + ProlongateArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &v, const int *geo_map, const int parity) : kernel_param(dim3(out[0].VolumeCB(), out[0].SiteSubset() * out.size(), fineColor/fine_colors_per_thread())), @@ -88,6 +90,8 @@ namespace quda { constexpr int color_unroll = 2; + ColorSpinor out; + #pragma unroll for (int s=0; s(2.0)); + } + +#pragma unroll + for (int s = 0; s < Arg::fineSpin; s++) { +#pragma unroll + for (int fine_color_local = 0; fine_color_local < fine_color_per_thread; fine_color_local++) { + int i = fine_color_block + fine_color_local; // global fine color index + arg.out[src_idx](spinor_parity, x_cb, s, i) = out(s, fine_color_local); + } + } } template struct Prolongator diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh index 9da7633071..ed2e70d580 100644 --- a/include/kernels/restrictor.cuh +++ b/include/kernels/restrictor.cuh @@ -21,13 +21,14 @@ namespace quda { /** Kernel argument struct */ - template + template struct RestrictArg : kernel_param<> { using real = Float; static 
constexpr int fineSpin = fineSpin_; static constexpr int fineColor = fineColor_; static constexpr int coarseSpin = coarseSpin_; static constexpr int coarseColor = coarseColor_; + static constexpr bool from_non_rel = from_non_rel_; // disable ghost to reduce arg size using F = FieldOrderCB(fineSpin), Float, Float, true>; @@ -52,6 +53,7 @@ namespace quda { static constexpr bool swizzle = true; int_fastdiv swizzle_factor; // for transposing blockIdx.x mapping to coarse grid coordinate + // when setting n_vector_z we can only use the compile-time part so can't include n_src in this min (not really an issue) static constexpr int n_vector_z = std::min(coarseColor/coarse_colors_per_thread(), max_z_block()); static_assert(n_vector_z > 0, "n_vector_z cannot be less than 1"); @@ -92,33 +94,30 @@ namespace quda { const int v_parity = (arg.v.Nparity() == 2) ? parity : 0; #pragma unroll - for (int s=0; s partial[color_unroll]; -#pragma unroll - for (int k=0; k in; + arg.in[src_idx].load(in.data, spinor_parity, x_cb); -#pragma unroll - for (int j=0; j(2.0)); + } #pragma unroll - for (int k=0; k fineColors; // clang-format on diff --git a/lib/prolongator.in.cu b/lib/prolongator.in.cu index d9e16b5f81..5fcdbd3828 100644 --- a/lib/prolongator.in.cu +++ b/lib/prolongator.in.cu @@ -7,7 +7,8 @@ namespace quda { template class ProlongateLaunch : public TunableKernel3D { - using Arg = ProlongateArg; + template + using Arg = ProlongateArg; cvector_ref &out; cvector_ref ∈ @@ -37,6 +38,7 @@ namespace quda { char rhs_str[16]; i32toa(rhs_str, out.size()); strcat(aux, rhs_str); + if (out[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",to_non_rel"); apply(device::get_default_stream()); } @@ -45,7 +47,15 @@ namespace quda { { if (checkNative(out[0], in[0], V)) { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity)); + if constexpr (fineSpin == 4) { + if (out[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { + launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity)); + } else { + launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity)); + } + } else { + launch(tp, stream, Arg(out, in, V, fine_to_coarse, parity)); + } } } diff --git a/lib/restrictor.in.cpp b/lib/restrictor.in.cpp index 2f6d2697e0..ca40557846 100644 --- a/lib/restrictor.in.cpp +++ b/lib/restrictor.in.cpp @@ -47,6 +47,10 @@ namespace quda const int *fine_to_coarse, const int *coarse_to_fine, const int *const *spin_map, int parity) { if constexpr (is_enabled_multigrid()) { + if (v.Nspin() != 1 && out[0].GammaBasis() != v.GammaBasis()) + errorQuda("Cannot apply restrictor using fields in a different basis from the null space (%d,%d) != %d", + out[0].GammaBasis(), in[0].GammaBasis(), v.GammaBasis()); + // clang-format off IntList<@QUDA_MULTIGRID_NC_NVEC_LIST@> fineColors; // clang-format on diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu index 281d5fb0bf..c56482fb49 100644 --- a/lib/restrictor.in.cu +++ b/lib/restrictor.in.cu @@ -15,7 +15,8 @@ namespace quda { template class RestrictLaunch : public TunableBlock2D { - using Arg = RestrictArg; + template + using Arg = RestrictArg; cvector_ref &out; cvector_ref ∈ const ColorSpinorField &v; @@ -42,6 +43,7 @@ namespace quda { char rhs_str[16]; i32toa(rhs_str, out.size()); strcat(aux, rhs_str); + if (in[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",from_non_rel"); apply(device::get_default_stream()); } @@ -50,15 +52,27 @@ namespace quda { { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); if 
(checkNative(out[0], in[0], v)) { - Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity); - arg.swizzle_factor = tp.aux.x; - launch(tp, stream, arg); + if constexpr (fineSpin == 4) { + if (in[0].GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { + Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity); + arg.swizzle_factor = tp.aux.x; + launch(tp, stream, arg); + } else { + Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity); + arg.swizzle_factor = tp.aux.x; + launch(tp, stream, arg); + } + } else { + Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity); + arg.swizzle_factor = tp.aux.x; + launch(tp, stream, arg); + } } } bool advanceAux(TuneParam ¶m) const { - if (Arg::swizzle) { + if (Arg::swizzle) { if (param.aux.x < 2 * (int)device::processor_count()) { param.aux.x++; return true; diff --git a/lib/transfer.cpp b/lib/transfer.cpp index d67c4c8228..cb74a57304 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -377,7 +377,7 @@ namespace quda { } // set output fields - if (out[0].Location() == QUDA_CPU_FIELD_LOCATION || out[0].GammaBasis() != V->GammaBasis()) { + if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) { for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); @@ -402,11 +402,6 @@ namespace quda { if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) errorQuda("Cannot prolongate to a full field since only have single parity null-space components"); - if ((V->Nspin() != 1) && ((output[0].GammaBasis() != V->GammaBasis()) || (input[0].GammaBasis() != V->GammaBasis()))) { - errorQuda("Cannot apply prolongator using fields in a different basis from the null space (%d,%d) != %d", - output[0].GammaBasis(), in[0].GammaBasis(), V->GammaBasis()); - } - Prolongate(output, input, *V, fine_to_coarse, spin_map, parity); for (auto i = 0u; i < out.size(); i++) out[i] = output[i]; // copy result to out field (aliasing handled automatically) @@ -422,7 +417,7 @@ namespace quda { { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, 1); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, in.size()); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; const int *coarse_to_fine = use_gpu ? coarse_to_fine_d : coarse_to_fine_h; @@ -450,12 +445,11 @@ namespace quda { if (use_gpu) { // set input fields - if (in[0].Location() == QUDA_CPU_FIELD_LOCATION || in[0].GammaBasis() != V->GammaBasis()) { + if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? 
fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); } else { for (auto i = 0u; i < out.size(); i++) input[i] = const_cast(in[i]).create_alias(); } - if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) { @@ -484,10 +478,6 @@ namespace quda { if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && in[0].SiteSubset() == QUDA_FULL_SITE_SUBSET) errorQuda("Cannot restrict a full field since only have single parity null-space components"); - if (V->Nspin() != 1 && (output[0].GammaBasis() != V->GammaBasis() || input[0].GammaBasis() != V->GammaBasis())) - errorQuda("Cannot apply restrictor using fields in a different basis from the null space (%d,%d) != %d", - out[0].GammaBasis(), input[0].GammaBasis(), V->GammaBasis()); - Restrict(output, input, *V, fine_to_coarse, coarse_to_fine, spin_map, parity); for (auto i = 0u; i < out.size(); i++) out[i] = output[i]; // copy result to out field (aliasing handled automatically) From e58f391238fd2099e9a4d3df6f16f9972a27a76d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 Jan 2024 06:26:53 -0800 Subject: [PATCH 10/21] Fix bug with restrictor where too large block size could be chosen (leading to inactive threads) --- lib/restrictor.in.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu index c56482fb49..ee067bac3e 100644 --- a/lib/restrictor.in.cu +++ b/lib/restrictor.in.cu @@ -95,7 +95,7 @@ namespace quda { { auto aggregate_size = in[0].Volume() / out[0].Volume(); auto max_block = 128u; - for (uint32_t b = blockMin(); b < max_block; b += blockStep()) if (aggregate_size < b) return b; + for (uint32_t b = blockMin(); b < max_block; b += blockStep()) if (aggregate_size <= b) return b; return max_block; } From acbd898a327c7ab5317f82aab22eec0dd73ea1ad Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 Jan 2024 07:04:20 -0800 Subject: [PATCH 11/21] Some tweaks to improve restrictor m-rhs perf --- include/kernels/restrictor.cuh | 2 +- lib/restrictor.in.cu | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh index ed2e70d580..aee1601915 100644 --- a/include/kernels/restrictor.cuh +++ b/include/kernels/restrictor.cuh @@ -16,7 +16,7 @@ namespace quda { //coarseColor >= 8 && coarseColor % 8 == 0 ? 8 : coarseColor >= 4 && coarseColor % 4 == 0 ? 4 : 2; } - constexpr int max_z_block() { return 8; } + constexpr int max_z_block() { return 12; } /** Kernel argument struct diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu index ee067bac3e..184cfca15f 100644 --- a/lib/restrictor.in.cu +++ b/lib/restrictor.in.cu @@ -94,7 +94,8 @@ namespace quda { unsigned int blockMapper() const { auto aggregate_size = in[0].Volume() / out[0].Volume(); - auto max_block = 128u; + // for multi-RHS use minimum block size to allow more srcs per block + auto max_block = in.size() > 1 ? 
blockMin() : 64u; for (uint32_t b = blockMin(); b < max_block; b += blockStep()) if (aggregate_size <= b) return b; return max_block; } From f06b154f3450984c9522118ac29dcdb740fe7d23 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 Jan 2024 10:04:58 -0800 Subject: [PATCH 12/21] Remove device temporaries from Transfer class - these are only needed if switching locations, and are now created as needed --- include/field_cache.h | 2 +- include/transfer.h | 27 ++++++++--------- lib/transfer.cpp | 68 +++++++++++++++++++++---------------------- 3 files changed, 48 insertions(+), 49 deletions(-) diff --git a/include/field_cache.h b/include/field_cache.h index 8d4a051ac7..5e05f9acf6 100644 --- a/include/field_cache.h +++ b/include/field_cache.h @@ -123,7 +123,7 @@ namespace quda { @param[in] a Vector of fields we wish to create a matching temporary for */ - template auto getFieldTmp(const vector_ref &a) + template auto getFieldTmp(cvector_ref &a) { std::vector> tmp; tmp.reserve(a.size()); diff --git a/include/transfer.h b/include/transfer.h index 22c4ae1a88..414e90decd 100644 --- a/include/transfer.h +++ b/include/transfer.h @@ -53,16 +53,10 @@ namespace quda { mutable ColorSpinorField *V_d = nullptr; /** A CPU temporary field with fine geometry and fine color we use for changing gamma basis */ - mutable std::vector fine_tmp_h; - - /** A GPU temporary field with fine geometry and fine color we use for changing gamma basis */ - mutable std::vector fine_tmp_d; + mutable ColorSpinorField fine_tmp_h; /** A CPU temporary field with coarse geometry and coarse color */ - mutable std::vector coarse_tmp_h; - - /** A GPU temporary field with coarse geometry and coarse color we use for CPU input / output */ - mutable std::vector coarse_tmp_d; + mutable ColorSpinorField coarse_tmp_h; /** The geometrical coarse grid blocking */ int *geo_bs = nullptr; @@ -125,11 +119,19 @@ namespace quda { void createV(QudaFieldLocation location) const; /** - * @brief Allocate temporaries used when applying transfer operators + * @brief Allocate host temporaries used when applying transfer operators * @param[in] location Where to allocate the temporaries - * @param[in] n_src Number of temporaries to allocate */ - void createTmp(QudaFieldLocation location, size_t n_src) const; + void createTmp() const; + + /** + * @brief Allocate temporaries needed for prolongator / restrictor + * application (if changing location) + * @param[in,out] tmp Storage for temporaries + * @param[in] new_location Location for new temporaries + * @param[in] a Field whose metadata we are cloning (aside from location and ordering) + */ + void createTmp(std::vector &tmp, QudaFieldLocation new_location, ColorSpinorField &a) const; /** * @brief Creates the map between fine and coarse grids @@ -146,9 +148,8 @@ namespace quda { /** * @brief Lazy allocation of the transfer operator in a given location * @param[in] location Where to allocate the temporaries - * @param[in] n_src Number of temporaries to allocate */ - void initializeLazy(QudaFieldLocation location, size_t n_src) const; + void initializeLazy(QudaFieldLocation location) const; public: /** diff --git a/lib/transfer.cpp b/lib/transfer.cpp index cb74a57304..fc48695190 100644 --- a/lib/transfer.cpp +++ b/lib/transfer.cpp @@ -96,7 +96,7 @@ namespace quda { } createV(B[0].Location()); // allocate V field - createTmp(QUDA_CPU_FIELD_LOCATION, 1); // allocate temporaries (needed for geomap creation) + createTmp(); // allocate temporaries (needed for geomap creation) // allocate and compute the
fine-to-coarse and coarse-to-fine site maps fine_to_coarse_h = static_cast(pool_pinned_malloc(B[0].Volume() * sizeof(int))); @@ -160,45 +160,33 @@ namespace quda { postTrace(); } - void Transfer::createTmp(QudaFieldLocation location, size_t n_src) const + void Transfer::createTmp() const { // The CPU temporaries are needed for creating geometry mappings. if ((transfer_type == QUDA_TRANSFER_COARSE_KD || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD - || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG) - && location != QUDA_CPU_FIELD_LOCATION) { + || transfer_type == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)) return; - } - if (location == QUDA_CUDA_FIELD_LOCATION && fine_tmp_d.size() == n_src && coarse_tmp_d.size() == n_src) return; - if (location == QUDA_CPU_FIELD_LOCATION && fine_tmp_h.size() == n_src && coarse_tmp_h.size() == n_src) return; + if (!fine_tmp_h.empty()) return; postTrace(); ColorSpinorParam param(B[0]); param.create = QUDA_NULL_FIELD_CREATE; - param.location = location; - param.fieldOrder = location == QUDA_CUDA_FIELD_LOCATION ? colorspinor::getNative(null_precision, param.nSpin) : - QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + param.location = QUDA_CPU_FIELD_LOCATION; + param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; if (param.Precision() < QUDA_SINGLE_PRECISION) param.setPrecision(QUDA_SINGLE_PRECISION); - if (location == QUDA_CUDA_FIELD_LOCATION) { - resize(fine_tmp_d, n_src, param); - coarse_tmp_d.push_back(fine_tmp_d[0].create_coarse(geo_bs, spin_bs, Nvec)); - resize(coarse_tmp_d, n_src, ColorSpinorParam(coarse_tmp_d[0])); - } else { - resize(fine_tmp_h, n_src, param); - coarse_tmp_h.push_back(fine_tmp_h[0].create_coarse(geo_bs, spin_bs, Nvec)); - resize(coarse_tmp_h, n_src, ColorSpinorParam(coarse_tmp_h[0])); - } + fine_tmp_h = ColorSpinorParam(param); + coarse_tmp_h = fine_tmp_h.create_coarse(geo_bs, spin_bs, Nvec); postTrace(); } - void Transfer::initializeLazy(QudaFieldLocation location, size_t n_src) const + void Transfer::initializeLazy(QudaFieldLocation location) const { if (!enable_cpu && !enable_gpu) errorQuda("Neither CPU or GPU coarse fields initialized"); - // delayed allocating this temporary until we need it - // if (B[0].Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION, n_src); - createTmp(location, n_src); + createTmp(); switch (location) { case QUDA_CUDA_FIELD_LOCATION: @@ -289,8 +277,8 @@ namespace quda { int x[QUDA_MAX_DIM]; - ColorSpinorField &fine(fine_tmp_h[0]); - ColorSpinorField &coarse(coarse_tmp_h[0]); + ColorSpinorField &fine(fine_tmp_h); + ColorSpinorField &coarse(coarse_tmp_h); // compute the coarse grid point for every site (assuming parity ordering currently) for (size_t i = 0; i < fine.Volume(); i++) { @@ -340,11 +328,22 @@ } } + void Transfer::createTmp(std::vector &tmp, QudaFieldLocation new_location, ColorSpinorField &a) const + { + ColorSpinorParam param(a); + param.location = new_location; + param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; // set to CPU order and override below if needed + param.setPrecision(param.Precision(), param.Precision(), new_location == QUDA_CUDA_FIELD_LOCATION ? true : false); + // ideally we'd want to be able to have tmp[0] on the temp stack as well + tmp[0] = ColorSpinorParam(param); + for (auto i = 1u; i < tmp.size(); i++) tmp[i] = getFieldTmp(tmp[0]); + } + // apply the prolongator void Transfer::P(cvector_ref &out, cvector_ref &in) const { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - initializeLazy(use_gpu ?
QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, in.size()); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; if (transfer_type == QUDA_TRANSFER_COARSE_KD) { @@ -371,18 +370,19 @@ namespace quda { // set input fields if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < in.size(); i++) input[i] = coarse_tmp_d[i].create_alias(); + createTmp(input, QUDA_CUDA_FIELD_LOCATION, coarse_tmp_h); } else { for (auto i = 0u; i < in.size(); i++) input[i] = const_cast(in[i]).create_alias(); } // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); + createTmp(output, QUDA_CUDA_FIELD_LOCATION, fine_tmp_h); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU"); + } else { // set input fields @@ -390,7 +390,7 @@ // set output fields if (out[0].Location() == QUDA_CUDA_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = (out[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); + createTmp(output, QUDA_CPU_FIELD_LOCATION, fine_tmp_h); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -417,7 +417,7 @@ { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); - initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION, in.size()); + initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION); const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h; const int *coarse_to_fine = use_gpu ? coarse_to_fine_d : coarse_to_fine_h; @@ -446,14 +446,14 @@ // set input fields if (in[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d[i].create_alias() : fine_tmp_d[i].Even().create_alias(); + createTmp(input, QUDA_CUDA_FIELD_LOCATION, fine_tmp_h); } else { for (auto i = 0u; i < out.size(); i++) input[i] = const_cast(in[i]).create_alias(); } // set output fields if (out[0].Location() == QUDA_CPU_FIELD_LOCATION) { - for (auto i = 0u; i < out.size(); i++) output[i] = coarse_tmp_d[i].create_alias(); + createTmp(output, QUDA_CUDA_FIELD_LOCATION, coarse_tmp_h); } else { for (auto i = 0u; i < out.size(); i++) output[i] = out[i].create_alias(); } @@ -462,15 +462,13 @@ // set input fields if (in[0].Location() == QUDA_CUDA_FIELD_LOCATION) { - for (auto i = 0u; i < in.size(); i++) input[i] = (in[i].SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h[i].create_alias() : fine_tmp_h[i].Even().create_alias(); + createTmp(input, QUDA_CPU_FIELD_LOCATION, fine_tmp_h); } else { for (auto i = 0u; i < in.size(); i++) input[i] = const_cast(in[i]).create_alias(); } // set output fields - // set input fields for (auto i = 0u; i < out.size(); i++) output[i] = const_cast(out[i]).create_alias(); - } for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically) FIXME - maybe not?
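A note on the pattern PATCH 12 relies on: rather than holding device temporaries as class members, Transfer now clones the metadata of a reference field on demand, overrides only the location (and the ordering implied by it), and recycles storage through the keyed pool behind getFieldTmp. What follows is a minimal, self-contained C++ sketch of that acquire/release shape only; the types FieldDesc, Field and TempField and the helper make_tmp are simplified stand-ins invented for illustration and are not QUDA API.

#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <vector>

// Simplified stand-in for the field metadata (the role ColorSpinorParam plays above).
struct FieldDesc {
  std::string location; // "cpu" or "cuda"
  std::string order;    // field ordering
  std::size_t volume = 0;
  bool operator<(const FieldDesc &o) const
  {
    return std::tie(location, order, volume) < std::tie(o.location, o.order, o.volume);
  }
};

// Simplified stand-in for a field: metadata plus storage.
struct Field {
  FieldDesc desc;
  std::vector<float> data;
  explicit Field(const FieldDesc &d) : desc(d), data(d.volume) { }
};

// RAII temporary drawn from a pool keyed on the metadata: reuse an idle
// field with matching metadata if one exists, otherwise allocate, and
// return the field to the pool on destruction.
class TempField {
  static inline std::multimap<FieldDesc, std::unique_ptr<Field>> pool;
  std::unique_ptr<Field> field;

public:
  explicit TempField(const FieldDesc &desc)
  {
    if (auto it = pool.find(desc); it != pool.end()) {
      field = std::move(it->second); // recycle an idle temporary
      pool.erase(it);
    } else {
      field = std::make_unique<Field>(desc); // none cached, so allocate
    }
  }
  TempField(TempField &&) = default; // a moved-from object holds no field
  ~TempField()
  {
    if (field) {
      auto key = field->desc;
      pool.emplace(std::move(key), std::move(field)); // return to the pool
    }
  }
  Field &get() { return *field; }
};

// Mirror of the createTmp shape above: clone the metadata of a reference
// field, override only location and ordering, then fill a vector of
// pooled temporaries.
std::vector<TempField> make_tmp(std::size_t n, const Field &ref, const std::string &new_location)
{
  FieldDesc desc = ref.desc; // clone metadata from the reference field
  desc.location = new_location;
  desc.order = new_location == "cuda" ? "native" : "space_spin_color";
  std::vector<TempField> tmp;
  tmp.reserve(n);
  for (std::size_t i = 0; i < n; i++) tmp.emplace_back(desc);
  return tmp;
}

In the patches themselves the key is the real field metadata, and tmp[0] in createTmp is a direct allocation with only the remaining entries drawn from getFieldTmp; the sketch keeps just the reuse-or-allocate and return-on-destruction behaviour. PATCH 13 below extends the same cache so that a recycled temporary also has its gamma basis matched to the requesting field.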
From 0c38bef0fef01e9a18975538358db1a33c50547d Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 11:56:12 -0800
Subject: [PATCH 13/21] getFieldTmp will now change basis on the fly to match the desired kind

---
 include/color_spinor_field.h | 1 +
 lib/field_cache.cpp          | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index d91531b900..453a5e8df4 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -695,6 +695,7 @@ namespace quda
     QudaSiteOrder SiteOrder() const { return siteOrder; }
     QudaFieldOrder FieldOrder() const { return fieldOrder; }
     QudaGammaBasis GammaBasis() const { return gammaBasis; }
+    void GammaBasis(QudaGammaBasis new_basis) { gammaBasis = new_basis; }
     const int *GhostFace() const { return ghostFace.data; }
     const int *GhostFaceCB() const { return ghostFaceCB.data; }

diff --git a/lib/field_cache.cpp b/lib/field_cache.cpp
index 9089a96f0c..5ea09e3059 100644
--- a/lib/field_cache.cpp
+++ b/lib/field_cache.cpp
@@ -17,6 +17,10 @@ namespace quda {
       param.create = QUDA_ZERO_FIELD_CREATE;
       tmp = T(param);
     }
+
+    if constexpr (std::is_same_v<T, ColorSpinorField>) {
+      tmp.GammaBasis(a.GammaBasis()); // ensure gamma basis matches
+    }
   }

   template <typename T> FieldTmp<T>::FieldTmp(const FieldKey<T> &key, const typename T::param_type &param) : key(key)

From 540d5f38787972d401a737be6b4163765e4669ba Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 12:01:31 -0800
Subject: [PATCH 14/21] Enable use of multi-RHS prolongator and restrictor for initial orthonormality check in MG::verify() (and some temporary cleanup)

---
 include/multigrid.h |  9 -----
 lib/multigrid.cpp   | 89 ++++++++++++++++++++-------------------
 2 files changed, 39 insertions(+), 59 deletions(-)

diff --git a/include/multigrid.h b/include/multigrid.h
index a369789dfc..35253bd0a6 100644
--- a/include/multigrid.h
+++ b/include/multigrid.h
@@ -297,21 +297,12 @@ namespace quda {
     /** Residual vector */
     ColorSpinorField r;

-    /** Projected source vector for preconditioned system, else just points to source */
-    ColorSpinorField b_tilde;

     /** Coarse residual vector */
     ColorSpinorField r_coarse;

     /** Coarse solution vector */
     ColorSpinorField x_coarse;

-    /** Coarse temporary vector */
-    ColorSpinorField tmp_coarse;
-
-    /** Sloppy coarse temporary vector */
-    ColorSpinorField tmp_coarse_sloppy;
-
     /** Kahler-Dirac Xinv */
     std::shared_ptr<GaugeField> xInvKD;

diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp
index fac28198d3..63cd635c34 100644
--- a/lib/multigrid.cpp
+++ b/lib/multigrid.cpp
@@ -47,13 +47,6 @@ namespace quda
       if (param.B[0].Nspin() == 1) csParam.gammaBasis = param.B[0].GammaBasis(); // hack for staggered to avoid unnecessary basis checks
       r = ColorSpinorField(csParam);
-
-      // if we're using preconditioning then allocate storage for the preconditioned source vector
-      if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) {
-        csParam.x[0] /= 2;
-        csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-        b_tilde = ColorSpinorField(csParam);
-      }
     }

     rng = new RNG(param.B[0], 1234);

@@ -138,11 +131,6 @@ namespace quda
     for (int i = 0; i < QUDA_MAX_MG_LEVEL; i++) param.mg_global.geo_block_size[param.level][i] = param.geoBlockSize[i];

-    // create coarse temporary vector if not already created in verify()
-    if (tmp_coarse.empty())
-      tmp_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(),
-                                            param.mg_global.location[param.level + 1]);
-
     // create coarse residual vector if not already created in verify()
     if (r_coarse.empty())
      r_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(),
                                          param.mg_global.location[param.level + 1]);
@@ -184,9 +172,6 @@ namespace quda
     createSmoother();

     if (param.level < param.Nlevel-1) {
-      // If enabled, verify the coarse links and fine solvers were correctly built
-      if (param.mg_global.run_verify) verify();
-
       // creating or resetting the coarse level temporaries and solvers
       if (coarse) {
         coarse->param.updateInvertParam(*param.mg_global.invert_param);
@@ -209,6 +194,9 @@ namespace quda
       setOutputPrefix(prefix); // restore since we just popped back from coarse grid

       createCoarseSolver();
+
+      // If enabled, verify the coarse links and fine solvers were correctly built
+      if (param.mg_global.run_verify) verify();
     }

     // We're going back up the coarse construct stack now, prefetch the gauge fields on
@@ -504,11 +492,6 @@ namespace quda
       xInvKD_sloppy = std::shared_ptr<GaugeField>(reinterpret_cast<GaugeField *>(new GaugeField(xinv_param)));
       xInvKD_sloppy->copy(*xInvKD);

-      ColorSpinorParam sloppy_tmp_param(tmp_coarse);
-      sloppy_tmp_param.setPrecision(param.mg_global.invert_param->cuda_prec_precondition);
-
-      tmp_coarse_sloppy = ColorSpinorField(sloppy_tmp_param);
-
     } else {
       // We can just alias fields
       xInvKD_sloppy = xInvKD;
@@ -777,15 +760,8 @@ namespace quda
   {
     pushLevel(param.level);

-    // temporary fields used for verification
-    ColorSpinorParam csParam(r);
-    csParam.create = QUDA_NULL_FIELD_CREATE;
-    ColorSpinorField tmp1(csParam);
-    ColorSpinorField tmp2(csParam);
-
-    QudaPrecision prec = (param.mg_global.precision_null[param.level] < csParam.Precision()) ?
-      param.mg_global.precision_null[param.level] :
-      csParam.Precision();
+    QudaPrecision prec = (param.mg_global.precision_null[param.level] < r.Precision()) ?
+      param.mg_global.precision_null[param.level] : r.Precision();

     // may want to revisit this---these were relaxed for cases where ghost_precision < precision
     // these were set while hacking in tests of quarter precision ghosts
@@ -798,28 +774,44 @@ namespace quda
     default: tol = 1e-8;
     }

+    // temporary fields used for verification
+    std::vector<ColorSpinorField> fine_tmp(param.Nvec);
+    ColorSpinorParam fine_param(r);
+    fine_param.create = QUDA_NULL_FIELD_CREATE;
+    for (auto &f : fine_tmp) f = ColorSpinorField(fine_param);
+
+    std::vector<ColorSpinorField> coarse_tmp(param.Nvec);
+    ColorSpinorParam coarse_param(r_coarse);
+    coarse_param.create = QUDA_NULL_FIELD_CREATE;
+    for (auto &c : coarse_tmp) c = ColorSpinorField(coarse_param);
+
+    auto &tmp1 = fine_tmp[0];
+    auto &tmp2 = fine_tmp[1];
+    auto &tmp_coarse = coarse_tmp[0];
+
     // No need to check (projector) v_k for staggered case
     if (param.transfer_type == QUDA_TRANSFER_AGGREGATE) {
       logQuda(QUDA_SUMMARIZE, "Checking 0 = (1 - P P^\\dagger) v_k for %d vectors\n", param.Nvec);

-      for (int i = 0; i < param.Nvec; i++) {
-        // as well as copying to the correct location this also changes basis if necessary
-        tmp1 = param.B[i];
+      // change fine_tmp to match B basis to allow comparison
+      for (auto &f : fine_tmp) f.GammaBasis(param.B[0].GammaBasis());
+      transfer->R(coarse_tmp, param.B);
+      transfer->P(fine_tmp, coarse_tmp);

-        transfer->R(r_coarse, tmp1);
-        transfer->P(tmp2, r_coarse);
-        auto max_deviation = blas::max_deviation(tmp2, tmp1);
-        auto l2_deviation = sqrt(xmyNorm(tmp1, tmp2) / norm2(tmp1));
+      for (auto i = 0; i < param.Nvec; i++) {
+        auto max_deviation = blas::max_deviation(fine_tmp[i], param.B[i]);
+        auto l2_deviation = sqrt(xmyNorm(param.B[i], fine_tmp[i]) / norm2(param.B[i]));

-        logQuda(
-          QUDA_VERBOSE, "Vector %d: L2 norms v_k = %e P^\\dagger v_k = %e (1 - P P^\\dagger) v_k = %e; Deviations: L2 relative = %e, max = %e\n",
-          i, norm2(tmp1), norm2(r_coarse), norm2(tmp2), l2_deviation, max_deviation[0]);
+        logQuda(QUDA_VERBOSE,
+                "Vector %d: L2 norms v_k = %e P^\\dagger v_k = %e (1 - P P^\\dagger) v_k = %e; Deviations: L2 relative = %e, max = %e\n",
+                i, norm2(param.B[i]), norm2(coarse_tmp[i]), norm2(fine_tmp[i]), l2_deviation, max_deviation[0]);

         if (check_deviation(l2_deviation, tol))
           errorQuda("k=%d orthonormality failed: L2 relative deviation %e > %e", i, l2_deviation, tol);
         if (check_deviation(max_deviation[0], tol))
           errorQuda("k=%d orthonormality failed: max deviation %e > %e", i, max_deviation[0], tol);
       }
+      for (auto &f : fine_tmp) f.GammaBasis(r.GammaBasis()); // restore basis

       // the oblique check
       if (param.mg_global.run_oblique_proj_check) {
@@ -840,8 +832,8 @@ namespace quda
         logQuda(QUDA_SUMMARIZE, "Vector %d: norms %e %e\n", i, norm2(param.B[i]), norm2(tmp1));
         logQuda(QUDA_SUMMARIZE, "relative residual = %e\n", sqrt(xmyNorm(tmp2, tmp1) / norm2(param.B[i])));
       }
-      sprintf(prefix, "MG level %d (%s): ", param.level + 1,
-              param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
+
+      sprintf(prefix, "MG level %d (%s): ", param.level + 1, param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
       setOutputPrefix(prefix);
     }
   }
@@ -862,13 +854,6 @@ namespace quda
     }
 #endif

-    // We need to create temporary coarse vectors here for various verifications
-    // Otherwise these get created in the coarse `reset()` routine later
-
-    if (tmp_coarse.empty())
-      tmp_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(),
-                                            param.mg_global.location[param.level + 1]);
-
     // create coarse residual vector if not already created in verify()
     if (r_coarse.empty())
       r_coarse = param.B[0].create_coarse(param.geoBlockSize, param.spinBlockSize, param.Nvec, r.Precision(),
                                           param.mg_global.location[param.level + 1]);
@@ -1185,8 +1170,12 @@ namespace quda

     diracSmoother->prepare(in, out, x, b, outer_solution_type);

-    // b_tilde holds either a copy of preconditioned source or a pointer to original source
-    if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) b_tilde = *in;
+    ColorSpinorField b_tilde;
+    // if we're using preconditioning then allocate storage for the preconditioned source vector
+    if (param.smoother_solve_type == QUDA_DIRECT_PC_SOLVE) {
+      b_tilde = getFieldTmp(r.Even());
+      b_tilde = *in; // b_tilde holds a copy of the preconditioned source
+    }

     if (presmoother) (*presmoother)(*out, *in); else zero(*out);

From 3fd08a1902fa8d24a95bbe8c833e7f77fc18af9d Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 13:27:43 -0800
Subject: [PATCH 15/21] Move fix to avoid copies to ColorSpinorField::copy, instead of being in blas::copy: this removes a bunch of unnecessary copies

---
 include/blas_quda.h        | 12 +-----------
 lib/color_spinor_field.cpp |  9 ++++++++-
 lib/transfer.cpp           |  2 +-
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/include/blas_quda.h b/include/blas_quda.h
index 07b09f1209..c2090f24a2 100644
--- a/include/blas_quda.h
+++ b/include/blas_quda.h
@@ -28,17 +28,7 @@ namespace quda {
       for (auto i = 0u; i < x.size(); i++) x[i].zero();
     }

-    inline void copy(ColorSpinorField &dst, const ColorSpinorField &src)
-    {
-      if (dst.data() == src.data()) {
-        // check the fields are equivalent else error
-        if (ColorSpinorField::are_compatible(dst, src))
-          return;
-        else
-          errorQuda("Aliasing pointers with incompatible fields");
-      }
-      dst.copy(src);
-    }
+    inline void copy(ColorSpinorField &dst, const ColorSpinorField &src) { dst.copy(src); }

     /** @brief Apply
       the operation y = a * x

diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 9c6c5d0272..f49e76d9cb 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -401,6 +401,12 @@ namespace quda

   void ColorSpinorField::copy(const ColorSpinorField &src)
   {
+    // if these are the same field then return
+    if (data() == src.data()) {
+      if (are_compatible(*this, src)) return;
+      else errorQuda("Aliasing pointers with incompatible fields");
+    }
+
     test_compatible_weak(*this, src);

     if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
@@ -672,7 +678,7 @@ namespace quda

   bool ColorSpinorField::are_compatible(const ColorSpinorField &a, const ColorSpinorField &b)
   {
-    return (a.Precision() == b.Precision() && a.FieldOrder() == b.FieldOrder() && are_compatible_weak(a, b));
+    return (a.Precision() == b.Precision() && a.FieldOrder() == b.FieldOrder() && a.GammaBasis() == b.GammaBasis() && are_compatible_weak(a, b));
   }

   void ColorSpinorField::test_compatible_weak(const ColorSpinorField &a, const ColorSpinorField &b)
@@ -691,6 +697,7 @@ namespace quda
     test_compatible_weak(a, b);
     if (a.Precision() != b.Precision()) errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision());
     if (a.FieldOrder() != b.FieldOrder()) errorQuda("orders do not match: %d %d", a.FieldOrder(), b.FieldOrder());
+    if (a.GammaBasis() != b.GammaBasis()) errorQuda("basis does not match: %d %d", a.GammaBasis(), b.GammaBasis());
   }

   const ColorSpinorField &ColorSpinorField::Even() const

diff --git a/lib/transfer.cpp b/lib/transfer.cpp
index fc48695190..3e0735b86c 100644
--- a/lib/transfer.cpp
+++ b/lib/transfer.cpp
@@ -397,7 +397,7 @@ namespace quda {

     }

-    for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically) FIXME - maybe not?
+    for (auto i = 0u; i < in.size(); i++) input[i] = in[i]; // copy result to input field (aliasing handled automatically)

     if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out[0].SiteSubset() == QUDA_FULL_SITE_SUBSET)
       errorQuda("Cannot prolongate to a full field since only have single parity null-space components");

From ade7923684748a454e2d22c0fe22ab0720855d04 Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 13:47:42 -0800
Subject: [PATCH 16/21] Fix restrictor compilation with clang

---
 include/kernels/restrictor.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh
index aee1601915..7ca4985b0f 100644
--- a/include/kernels/restrictor.cuh
+++ b/include/kernels/restrictor.cuh
@@ -105,7 +105,7 @@ namespace quda {
         int i = coarse_color_block + coarse_color_local;

         ColorSpinor in;
-        arg.in[src_idx].load(in.data, spinor_parity, x_cb);
+        arg.in[src_idx].template load(in.data, spinor_parity, x_cb);

         if constexpr (Arg::fineSpin == 4 && Arg::from_non_rel) {
           in.toRel();

From e982208eafeb681deecb7cc37e4c8a6e70296a5d Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 14:30:48 -0800
Subject: [PATCH 17/21] Increase m-rhs limit to 96 for prolongator and restrictor

---
 include/kernels/prolongator.cuh | 2 +-
 include/kernels/restrictor.cuh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/kernels/prolongator.cuh b/include/kernels/prolongator.cuh
index 515c409e2c..f9175e3901 100644
--- a/include/kernels/prolongator.cuh
+++ b/include/kernels/prolongator.cuh
@@ -29,7 +29,7 @@ namespace quda {
     using C = FieldOrderCB(coarseSpin), Float, Float, true>;
     using V = FieldOrderCB(fineSpin), vFloat, vFloat>;

-    static constexpr unsigned int max_n_src = 64;
+    static constexpr unsigned int max_n_src = 96;
     const int_fastdiv n_src;
     F out[max_n_src];
     C in[max_n_src];

diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh
index 7ca4985b0f..4b30874d35 100644
--- a/include/kernels/restrictor.cuh
+++ b/include/kernels/restrictor.cuh
@@ -35,7 +35,7 @@ namespace quda {
     using C = FieldOrderCB(coarseSpin), Float, Float, true>;
     using V = FieldOrderCB(fineSpin), vFloat>;

-    static constexpr unsigned int max_n_src = 64;
+    static constexpr unsigned int max_n_src = 96;
     const int_fastdiv n_src;

     C out[max_n_src];

From 984146ed401aebe7297cf8e739fbff5f82a64b07 Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 17:13:49 -0800
Subject: [PATCH 18/21] Fix spinor_dilute warning that was missed before

---
 lib/spinor_dilute.in.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu
index eadd519c5b..edc28e6267 100644
--- a/lib/spinor_dilute.in.cu
+++ b/lib/spinor_dilute.in.cu
@@ -91,7 +91,8 @@ namespace quda
         SpinorDilute(src, v, type, local_block);
       } else {
         errorQuda(
-          "nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)");
+          "nColor = %d is too large to compile, see QUDA issue #1422 (https://github.com/lattice/quda/issues/1422)",
+          src.Ncolor());
       }
     } else {
       if constexpr (sizeof...(N) > 0)

From e3cf139ad1646bdd58fcae9d69db444c706bd1af Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 17:16:07 -0800
Subject: [PATCH 19/21] Add batching support to prolongator, restrictor and dslash coarse.
 Max single kernel size is now set by a CMake parameter QUDA_MAX_MULTI_RHS, which defaults to 32

---
 CMakeLists.txt                    |  4 ++++
 include/kernels/dslash_coarse.cuh |  2 +-
 include/kernels/prolongator.cuh   |  2 +-
 include/kernels/restrictor.cuh    |  2 +-
 include/quda_define.h.in          |  7 +++++++
 lib/dslash_coarse.in.cpp          | 15 +++++++++++++--
 lib/prolongator.in.cpp            |  9 ++++++++-
 lib/restrictor.in.cpp             |  9 ++++++++-
 8 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e22f66d9cb..03ea9f4fba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -217,6 +217,9 @@ if(QUDA_MAX_MULTI_BLAS_N GREATER 32)
   message(SEND_ERROR "Maximum QUDA_MAX_MULTI_BLAS_N is 32.")
 endif()

+# Set the maximum multi-RHS per kernel
+set(QUDA_MAX_MULTI_RHS "32" CACHE STRING "maximum number of simultaneous RHS in a kernel")
+
 set(QUDA_PRECISION "14" CACHE STRING "which precisions to instantiate in QUDA (4-bit number - double, single, half, quarter)")
@@ -275,6 +278,7 @@ mark_as_advanced(QUDA_FAST_COMPILE_DSLASH)
 mark_as_advanced(QUDA_ALTERNATIVE_I_TO_F)
 mark_as_advanced(QUDA_MAX_MULTI_BLAS_N)
+mark_as_advanced(QUDA_MAX_MULTI_RHS)
 mark_as_advanced(QUDA_PRECISION)
 mark_as_advanced(QUDA_RECONSTRUCT)
 mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE)

diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh
index 744a74f874..531ad18ebe 100644
--- a/include/kernels/dslash_coarse.cuh
+++ b/include/kernels/dslash_coarse.cuh
@@ -52,7 +52,7 @@ namespace quda {
     using F = typename colorspinor::FieldOrderCB;
     using GY = typename gauge::FieldOrder;

-    static constexpr unsigned int max_n_src = 64;
+    static constexpr unsigned int max_n_src = MAX_MULTI_RHS;
     const int_fastdiv n_src;
     F out[max_n_src];
     F inA[max_n_src];

diff --git a/include/kernels/prolongator.cuh b/include/kernels/prolongator.cuh
index f9175e3901..046037e24b 100644
--- a/include/kernels/prolongator.cuh
+++ b/include/kernels/prolongator.cuh
@@ -29,7 +29,7 @@ namespace quda {
     using C = FieldOrderCB(coarseSpin), Float, Float, true>;
     using V = FieldOrderCB(fineSpin), vFloat, vFloat>;

-    static constexpr unsigned int max_n_src = 96;
+    static constexpr unsigned int max_n_src = MAX_MULTI_RHS;
     const int_fastdiv n_src;
     F out[max_n_src];
     C in[max_n_src];

diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh
index 4b30874d35..e8c3662c7f 100644
--- a/include/kernels/restrictor.cuh
+++ b/include/kernels/restrictor.cuh
@@ -35,7 +35,7 @@ namespace quda {
     using C = FieldOrderCB(coarseSpin), Float, Float, true>;
     using V = FieldOrderCB(fineSpin), vFloat>;

-    static constexpr unsigned int max_n_src = 96;
+    static constexpr unsigned int max_n_src = MAX_MULTI_RHS;
     const int_fastdiv n_src;

     C out[max_n_src];

diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index 850c3990df..445d3cd544 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -18,6 +18,13 @@
  */
 #define MAX_MULTI_BLAS_N @QUDA_MAX_MULTI_BLAS_N@

+/**
+ * @def MAX_MULTI_RHS
+ * @brief This macro sets the limit on the number of simultaneous
+ * right-hand sides per kernel in the multi-RHS kernels
+ */
+#define MAX_MULTI_RHS @QUDA_MAX_MULTI_RHS@
+
 #cmakedefine QUDA_HETEROGENEOUS_ATOMIC
 #ifdef QUDA_HETEROGENEOUS_ATOMIC
 /**

diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp
index d2a51102e8..c90b119b42 100644
--- a/lib/dslash_coarse.in.cpp
+++ b/lib/dslash_coarse.in.cpp
@@ -103,8 +103,19 @@ namespace quda {

     if constexpr (is_enabled_multigrid()) {
       if (!DiracCoarse::apply_mma(out, use_mma) || checkLocation(Y, X) == QUDA_CPU_FIELD_LOCATION) {
-        ApplyCoarse(out, inA, inB, Y, X, kappa, parity, dslash, clover, dagger, commDim, halo_precision,
-                    IntList<@QUDA_MULTIGRID_NVEC_LIST@>());
+
+        for (auto i = 0u; i < inA.size(); i += MAX_MULTI_RHS) { // batching if needed
+          auto inA_begin = inA.begin() + i;
+          auto inA_end = std::min(inA.begin() + (i + MAX_MULTI_RHS), inA.end());
+          auto inB_begin = inB.begin() + i;
+          auto inB_end = std::min(inB.begin() + (i + MAX_MULTI_RHS), inB.end());
+          auto out_begin = out.begin() + i;
+          auto out_end = std::min(out.begin() + (i + MAX_MULTI_RHS), out.end());
+
+          ApplyCoarse({out_begin, out_end}, {inA_begin, inA_end}, {inB_begin, inB_end},
+                      Y, X, kappa, parity, dslash, clover, dagger, commDim, halo_precision,
+                      IntList<@QUDA_MULTIGRID_NVEC_LIST@>());
+        }
       } else {
         constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
         ColorSpinorField v_inA = create_color_spinor_copy(inA, csOrder);

diff --git a/lib/prolongator.in.cpp b/lib/prolongator.in.cpp
index 286c18ba11..277d5c69f6 100644
--- a/lib/prolongator.in.cpp
+++ b/lib/prolongator.in.cpp
@@ -54,7 +54,14 @@ namespace quda
       // clang-format off
       IntList<@QUDA_MULTIGRID_NC_NVEC_LIST@> fineColors;
       // clang-format on
-      Prolongate(out, in, v, fine_to_coarse, spin_map, parity, fineColors);
+
+      for (auto i = 0u; i < in.size(); i += MAX_MULTI_RHS) { // batching if needed
+        auto in_begin = in.begin() + i;
+        auto in_end = std::min(in.begin() + (i + MAX_MULTI_RHS), in.end());
+        auto out_begin = out.begin() + i;
+        auto out_end = std::min(out.begin() + (i + MAX_MULTI_RHS), out.end());
+        Prolongate({out_begin, out_end}, {in_begin, in_end}, v, fine_to_coarse, spin_map, parity, fineColors);
+      }
     } else {
       errorQuda("Multigrid has not been built");
     }

diff --git a/lib/restrictor.in.cpp b/lib/restrictor.in.cpp
index ca40557846..3f12e4e380 100644
--- a/lib/restrictor.in.cpp
+++ b/lib/restrictor.in.cpp
@@ -54,7 +54,14 @@ namespace quda
      // clang-format off
      IntList<@QUDA_MULTIGRID_NC_NVEC_LIST@> fineColors;
      // clang-format on
-     Restrict(out, in, v, fine_to_coarse, coarse_to_fine, spin_map, parity, fineColors);
+
+     for (auto i = 0u; i < in.size(); i += MAX_MULTI_RHS) { // batching if needed
+       auto in_begin = in.begin() + i;
+       auto in_end = std::min(in.begin() + (i + MAX_MULTI_RHS), in.end());
+       auto out_begin = out.begin() + i;
+       auto out_end = std::min(out.begin() + (i + MAX_MULTI_RHS), out.end());
+       Restrict({out_begin, out_end}, {in_begin, in_end}, v, fine_to_coarse, coarse_to_fine, spin_map, parity, fineColors);
+     }
    } else {
      errorQuda("Multigrid has not been built");
    }

From 5e2385007a1d867e6f5423386ae0c79e158767ea Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 17:16:45 -0800
Subject: [PATCH 20/21] Disable swizzle with restrictor when doing multi-RHS, since it doesn't do anything in this case except make the tuning longer

---
 lib/restrictor.in.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu
index 184cfca15f..595694873d 100644
--- a/lib/restrictor.in.cu
+++ b/lib/restrictor.in.cu
@@ -72,7 +72,7 @@ namespace quda {

     bool advanceAux(TuneParam &param) const
     {
-      if (Arg::swizzle) {
+      if (Arg::swizzle && in.size() == 1) {
         if (param.aux.x < 2 * (int)device::processor_count()) {
           param.aux.x++;
           return true;

From a6aafaff3fd7ed7d4694b78afd1e857fc0dee2f1 Mon Sep 17 00:00:00 2001
From: maddyscientist
Date: Wed, 31 Jan 2024 17:23:30 -0800
Subject: [PATCH 21/21] On second thoughts, keep the swizzle tuning for small RHS count

---
 lib/restrictor.in.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/restrictor.in.cu b/lib/restrictor.in.cu
index 595694873d..c25d8a5701 100644
--- a/lib/restrictor.in.cu
+++ b/lib/restrictor.in.cu
@@ -72,7 +72,7 @@ namespace quda {

     bool advanceAux(TuneParam &param) const
     {
-      if (Arg::swizzle && in.size() == 1) {
+      if (Arg::swizzle && in.size() < 8) {
         if (param.aux.x < 2 * (int)device::processor_count()) {
           param.aux.x++;
           return true;
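
Note: patches 19-21 bound each kernel launch by MAX_MULTI_RHS and walk the sources in
fixed-size batches, so an arbitrarily large right-hand-side set never overflows the
kernel-argument arrays. The standalone C++ sketch below reduces that idiom to its
essentials; it is illustrative only, with apply_kernel standing in for the real
ApplyCoarse/Prolongate/Restrict calls and a local constant mimicking the CMake-derived
MAX_MULTI_RHS macro.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

constexpr std::size_t MAX_MULTI_RHS = 32; // stands in for the CMake-derived macro

// stands in for one ApplyCoarse / Prolongate / Restrict launch on a batch
void apply_kernel(const std::vector<int> &batch)
{
  std::printf("kernel launch with %zu right-hand sides\n", batch.size());
}

int main()
{
  std::vector<int> rhs(70); // e.g. 70 sources -> batches of 32, 32 and 6

  for (std::size_t i = 0; i < rhs.size(); i += MAX_MULTI_RHS) {
    // clamp the batch end so the final batch may be smaller than the maximum
    std::size_t stop = std::min(rhs.size(), i + MAX_MULTI_RHS);
    std::vector<int> batch(rhs.begin() + i, rhs.begin() + stop);
    apply_kernel(batch); // each batch respects the kernel-arg array bound
  }
}

Because the final batch is simply smaller than the maximum, no padding or remainder-specific
kernel is needed; only the runtime n_src divisor inside the kernel argument changes per launch.
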