fix shared memory consistency checks
jcosborn committed Feb 6, 2024
1 parent ef04c5c commit 40ddc38
Showing 5 changed files with 131 additions and 60 deletions.
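
In brief: the CHECK_SHARED_BYTES consistency check, which verifies that the shared memory required by a kernel's declared ops (sharedMemSize<getKernelOps<Functor<Arg>>>) matches what the autotuner reserves (the larger of sharedBytesPerThread() times the block volume and sharedBytesPerBlock(tp)), moves from the generic launch() dispatchers into the launch_device()/launch_host() implementations, where the functor's actual argument type is known. Two SYCL-specific sharedBytesPerBlock() changes are also added so the tuner's accounting agrees with the kernels' declared ops.

The following is a minimal standalone sketch of the relocated check, not QUDA code; every name in it is an illustrative stand-in for the corresponding QUDA template:

#include <algorithm>
#include <cstdio>
#include <cstdlib>

struct Dim3 { unsigned x, y, z; }; // stand-in for the launch block dimensions

// stand-in for sharedMemSize<getKernelOps<Functor<Arg>>>(block): bytes the
// kernel's declared operations request for this block shape
unsigned opsSharedBytes(const Dim3 &b) { return 8 * b.x * b.y * b.z; }

// stand-ins for the tuner's per-thread / per-block shared-memory policy
unsigned sharedBytesPerThread() { return 8; }
unsigned sharedBytesPerBlock(const Dim3 &) { return 0; }

void checkSharedBytes(const Dim3 &b)
{
  unsigned sizeOps = opsSharedBytes(b);
  unsigned sizeTp = std::max(sharedBytesPerThread() * b.x * b.y * b.z, sharedBytesPerBlock(b));
  if (sizeOps != sizeTp) {
    std::fprintf(stderr, "Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
    std::abort(); // QUDA's errorQuda() likewise aborts with a diagnostic
  }
}

int main() { checkSharedBytes({128, 2, 1}); }

As far as the diff shows, the check is enabled by defining CHECK_SHARED_BYTES at compile time (e.g. passing -DCHECK_SHARED_BYTES in the compiler flags); no dedicated build option appears here.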
29 changes: 20 additions & 9 deletions include/tunable_block_reduction.h
@@ -55,6 +55,16 @@ namespace quda
template <template <typename> class Functor, typename Block, unsigned int idx = 0, typename FunctorArg>
void launch_device(const TuneParam &tp, const qudaStream_t &stream, const FunctorArg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+using BlockArg = BlockKernelArg<Block::block[idx], FunctorArg>;
+auto sizeOps = sharedMemSize<getKernelOps<Functor<BlockArg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<BlockArg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
// if Block::block[idx] == 1, then we aren't templating on block size
if (tp.block.x == Block::block[idx] || Block::block[idx] == 1) {
const_cast<FunctorArg &>(arg).grid_dim = tp.grid;
@@ -89,6 +99,16 @@ namespace quda
template <template <typename> class Functor, typename Block, unsigned int idx = 0, typename Arg>
void launch_host(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+using BlockArg = BlockKernelArg<Block::block[idx], Arg>;
+auto sizeOps = sharedMemSize<getKernelOps<Functor<BlockArg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<BlockArg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (tp.block.x == Block::block[idx]) {
const_cast<Arg &>(arg).grid_dim = tp.grid;
const_cast<Arg &>(arg).block_dim = tp.block;
@@ -119,15 +139,6 @@ namespace quda
template <template <typename> class Functor, typename Block, bool enable_host = false, typename Arg>
void launch(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor, Block>(tp, stream, arg);
} else if constexpr (enable_host) {
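Note the detail specific to this file: launch_device() and launch_host() instantiate the functor on BlockKernelArg<Block::block[idx], ...>, a type only fixed once the template recursion over Block has selected idx, so the relocated check measures the shared memory of the kernel instantiation that is actually launched. The check removed from launch() used the plain Functor<Arg>, which presumably could disagree with the block-templated instantiation; that mismatch appears to be what the commit message calls the consistency fix.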
90 changes: 59 additions & 31 deletions include/tunable_nd.h
@@ -35,6 +35,15 @@ namespace quda
template <template <typename> class Functor, typename Arg>
void launch_device(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
TunableKernel::launch_device<Functor, grid_stride>(KERNEL(Kernel1D), tp, stream, arg);
}

@@ -47,8 +56,17 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename Arg>
-void launch_host(const TuneParam &, const qudaStream_t &, const Arg &arg)
+void launch_host(const TuneParam &tp, const qudaStream_t &, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
Kernel1D_host<Functor, Arg>(arg);
}

@@ -64,15 +82,6 @@
template <template <typename> class Functor, bool enable_host = false, typename Arg>
void launch(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor, Arg>(tp, stream, arg);
} else if constexpr (enable_host) {
@@ -191,6 +200,15 @@ namespace quda
template <template <typename> class Functor, typename Arg>
void launch_device(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
const_cast<Arg &>(arg).threads.y = vector_length_y;
TunableKernel::launch_device<Functor, grid_stride>(KERNEL(Kernel2D), tp, stream, arg);
}
@@ -204,8 +222,17 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename Arg>
-void launch_host(const TuneParam &, const qudaStream_t &, const Arg &arg)
+void launch_host(const TuneParam &tp, const qudaStream_t &, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
const_cast<Arg &>(arg).threads.y = vector_length_y;
Kernel2D_host<Functor, Arg>(arg);
}
@@ -222,15 +249,6 @@
template <template <typename> class Functor, bool enable_host = false, typename Arg>
void launch(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (TunableKernel1D_base<grid_stride>::location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor, Arg>(tp, stream, arg);
} else if constexpr (enable_host) {
@@ -441,6 +459,16 @@ namespace quda
template <template <typename> class Functor, typename Arg>
void launch_device(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("KernelOps: %s\n", typeid(getKernelOps<Functor<Arg>>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
const_cast<Arg &>(arg).threads.y = vector_length_y;
const_cast<Arg &>(arg).threads.z = vector_length_z;
TunableKernel::launch_device<Functor, grid_stride>(KERNEL(Kernel3D), tp, stream, arg);
@@ -455,8 +483,18 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename Arg>
-void launch_host(const TuneParam &, const qudaStream_t &, const Arg &arg)
+void launch_host(const TuneParam &tp, const qudaStream_t &, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("KernelOps: %s\n", typeid(getKernelOps<Functor<Arg>>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
const_cast<Arg &>(arg).threads.y = vector_length_y;
const_cast<Arg &>(arg).threads.z = vector_length_z;
Kernel3D_host<Functor, Arg>(arg);
@@ -474,16 +512,6 @@
template <template <typename> class Functor, bool enable_host = false, typename Arg>
void launch(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("KernelOps: %s\n", typeid(getKernelOps<Functor<Arg>>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (TunableKernel2D_base<grid_stride>::location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor, Arg>(tp, stream, arg);
} else if constexpr (enable_host) {
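The same relocation is applied uniformly to the 1D, 2D and 3D kernel classes above; in the 3D class the diagnostic additionally prints the getKernelOps type, which helps attribute a mismatch when a functor composes several kernel ops.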
58 changes: 38 additions & 20 deletions include/tunable_reduction.h
@@ -71,6 +71,15 @@ namespace quda
template <template <typename> class Functor, typename T, typename Arg>
void launch_device(T &result, const TuneParam &tp, const qudaStream_t &stream, Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (tp.block.x * tp.block.y < static_cast<unsigned>(device::warp_size()))
errorQuda("Reduction kernels must use at least a warp of threads per block (%u %u < %u)", tp.block.x,
tp.block.y, device::warp_size());
@@ -96,8 +105,17 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename T, typename Arg>
-void launch_host(T &result, const TuneParam &, const qudaStream_t &, Arg &arg)
+void launch_host(T &result, const TuneParam &tp, const qudaStream_t &, Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (arg.threads.y != block_size_y)
errorQuda("Unexpected y threads: received %d, expected %d", arg.threads.y, block_size_y);
std::vector<T> result_(1);
@@ -122,15 +140,6 @@
template <template <typename> class Functor, bool enable_host = false, typename T, typename Arg>
void launch(T &result, const TuneParam &tp, const qudaStream_t &stream, Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor>(result, tp, stream, arg);
} else if constexpr (enable_host) {
@@ -231,6 +240,15 @@ namespace quda
template <template <typename> class Functor, typename T, typename Arg>
void launch_device(std::vector<T> &result, const TuneParam &tp, const qudaStream_t &stream, Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (tp.block.x * tp.block.y < static_cast<unsigned>(device::warp_size()))
errorQuda("Reduction kernels must use at least a warp of threads per block (%u %u < %u)", tp.block.x,
tp.block.y, device::warp_size());
@@ -254,8 +272,17 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename T, typename Arg>
-void launch_host(std::vector<T> &result, const TuneParam &, const qudaStream_t &, Arg &arg)
+void launch_host(std::vector<T> &result, const TuneParam &tp, const qudaStream_t &, Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (n_batch_block_max > Arg::max_n_batch_block)
errorQuda("n_batch_block_max = %u greater than maximum supported %u", n_batch_block_max, Arg::max_n_batch_block);

@@ -280,15 +307,6 @@
template <template <typename> class Functor, bool enable_host = false, typename T, typename Arg>
void launch(T &result, const TuneParam &tp, const qudaStream_t &stream, Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor>(result, tp, stream, arg);
} else if constexpr (enable_host) {
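In the reduction classes the relocated check runs ahead of the existing guard that reduction kernels use at least a warp of threads per block, so a shared-bytes mismatch is reported even for block shapes the guard would reject anyway.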
9 changes: 9 additions & 0 deletions lib/block_orthogonalize.in.cu
@@ -156,6 +156,15 @@ namespace quda {
}
}

+#if defined(QUDA_TARGET_SYCL)
+unsigned int sharedBytesPerBlock(const TuneParam &tp) const {
+using sum_t = double;
+int mVec = quda::tile_size<nColor, nVec>(tp.block.x);
+int vsize = 2 * sizeof(sum_t) * mVec;
+return vsize * (tp.block.x * tp.block.y * tp.block.z) / device::warp_size();
+}
+#endif

#ifdef SWIZZLE
bool advanceAux(TuneParam &param) const
{
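A worked example of the new SYCL-only reservation, with assumed numbers since mVec depends on quda::tile_size<nColor, nVec>: taking warp_size() = 32, a 32x8x1 block, and mVec = 4, each warp gets room for mVec complex double partial sums, so vsize = 2 * sizeof(double) * 4 = 64 bytes and the method returns 64 * (32*8*1) / 32 = 512 bytes, i.e. 64 bytes for each of the block's 8 warps.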
5 changes: 5 additions & 0 deletions lib/coarse_op.cuh
@@ -245,8 +245,13 @@ namespace quda {

unsigned int sharedBytesPerBlock(const TuneParam &param) const override
{
+#if defined(QUDA_TARGET_SYCL)
+if (type == COMPUTE_VUV || type == COMPUTE_VLV)
+return 4*sizeof(storeType)*arg.max_color_height_per_block*arg.max_color_width_per_block*4*coarseSpin*coarseSpin;
+#else
if (arg.shared_atomic && (type == COMPUTE_VUV || type == COMPUTE_VLV))
return 4*sizeof(storeType)*arg.max_color_height_per_block*arg.max_color_width_per_block*4*coarseSpin*coarseSpin;
+#endif
return TunableKernel3D::sharedBytesPerBlock(param);
}

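On the SYCL target the VUV/VLV shared-memory reservation is now made unconditionally, rather than only when arg.shared_atomic is set; presumably the SYCL kernel's declared ops request this storage regardless of the shared_atomic tuning state, so the unconditional reservation keeps sharedBytesPerBlock() consistent with the sharedMemSize-based check introduced above.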
