fix shared memory consistency checks
jcosborn committed Feb 6, 2024
1 parent ef04c5c commit 40ddc38
Showing 5 changed files with 131 additions and 60 deletions.
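
In brief: the CHECK_SHARED_BYTES consistency check, which verifies that the shared memory required by a kernel's declared ops (sharedMemSize<getKernelOps<Functor<Arg>>>) matches what the autotuner reserves (the larger of sharedBytesPerThread() times the block volume and sharedBytesPerBlock(tp)), moves from the generic launch() dispatchers into the launch_device()/launch_host() implementations, where the functor's actual argument type is known. Two SYCL-specific sharedBytesPerBlock() changes are also added so the tuner's accounting agrees with the kernels' declared ops.

The following is a minimal standalone sketch of the relocated check, not QUDA code; every name in it is an illustrative stand-in for the corresponding QUDA template:

#include <algorithm>
#include <cstdio>
#include <cstdlib>

struct Dim3 { unsigned x, y, z; }; // stand-in for the launch block dimensions

// stand-in for sharedMemSize<getKernelOps<Functor<Arg>>>(block): bytes the
// kernel's declared operations request for this block shape
unsigned opsSharedBytes(const Dim3 &b) { return 8 * b.x * b.y * b.z; }

// stand-ins for the tuner's per-thread / per-block shared-memory policy
unsigned sharedBytesPerThread() { return 8; }
unsigned sharedBytesPerBlock(const Dim3 &) { return 0; }

void checkSharedBytes(const Dim3 &b)
{
  unsigned sizeOps = opsSharedBytes(b);
  unsigned sizeTp = std::max(sharedBytesPerThread() * b.x * b.y * b.z, sharedBytesPerBlock(b));
  if (sizeOps != sizeTp) {
    std::fprintf(stderr, "Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
    std::abort(); // QUDA's errorQuda() likewise aborts with a diagnostic
  }
}

int main() { checkSharedBytes({128, 2, 1}); }

As far as the diff shows, the check is enabled by defining CHECK_SHARED_BYTES at compile time (e.g. passing -DCHECK_SHARED_BYTES in the compiler flags); no dedicated build option appears here.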
29 changes: 20 additions & 9 deletions include/tunable_block_reduction.h
@@ -55,6 +55,16 @@ namespace quda
template <template <typename> class Functor, typename Block, unsigned int idx = 0, typename FunctorArg>
void launch_device(const TuneParam &tp, const qudaStream_t &stream, const FunctorArg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+using BlockArg = BlockKernelArg<Block::block[idx], FunctorArg>;
+auto sizeOps = sharedMemSize<getKernelOps<Functor<BlockArg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<BlockArg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
// if Block::block[idx] == 1, then we aren't templating on block size
if (tp.block.x == Block::block[idx] || Block::block[idx] == 1) {
const_cast<FunctorArg &>(arg).grid_dim = tp.grid;
@@ -89,6 +99,16 @@ namespace quda
template <template <typename> class Functor, typename Block, unsigned int idx = 0, typename Arg>
void launch_host(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+using BlockArg = BlockKernelArg<Block::block[idx], Arg>;
+auto sizeOps = sharedMemSize<getKernelOps<Functor<BlockArg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<BlockArg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (tp.block.x == Block::block[idx]) {
const_cast<Arg &>(arg).grid_dim = tp.grid;
const_cast<Arg &>(arg).block_dim = tp.block;
@@ -119,15 +139,6 @@ namespace quda
template <template <typename> class Functor, typename Block, bool enable_host = false, typename Arg>
void launch(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor, Block>(tp, stream, arg);
} else if constexpr (enable_host) {
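Note the detail specific to this file: launch_device() and launch_host() instantiate the functor on BlockKernelArg<Block::block[idx], ...>, a type only fixed once the template recursion over Block has selected idx, so the relocated check measures the shared memory of the kernel instantiation that is actually launched. The check removed from launch() used the plain Functor<Arg>, which presumably could disagree with the block-templated instantiation; that mismatch appears to be what the commit message calls the consistency fix.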
90 changes: 59 additions & 31 deletions include/tunable_nd.h
@@ -35,6 +35,15 @@ namespace quda
template <template <typename> class Functor, typename Arg>
void launch_device(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
TunableKernel::launch_device<Functor, grid_stride>(KERNEL(Kernel1D), tp, stream, arg);
}

@@ -47,8 +56,17 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename Arg>
-void launch_host(const TuneParam &, const qudaStream_t &, const Arg &arg)
+void launch_host(const TuneParam &tp, const qudaStream_t &, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
Kernel1D_host<Functor, Arg>(arg);
}

@@ -64,15 +82,6 @@
template <template <typename> class Functor, bool enable_host = false, typename Arg>
void launch(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor, Arg>(tp, stream, arg);
} else if constexpr (enable_host) {
@@ -191,6 +200,15 @@ namespace quda
template <template <typename> class Functor, typename Arg>
void launch_device(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
const_cast<Arg &>(arg).threads.y = vector_length_y;
TunableKernel::launch_device<Functor, grid_stride>(KERNEL(Kernel2D), tp, stream, arg);
}
@@ -204,8 +222,17 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename Arg>
-void launch_host(const TuneParam &, const qudaStream_t &, const Arg &arg)
+void launch_host(const TuneParam &tp, const qudaStream_t &, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
const_cast<Arg &>(arg).threads.y = vector_length_y;
Kernel2D_host<Functor, Arg>(arg);
}
@@ -222,15 +249,6 @@
template <template <typename> class Functor, bool enable_host = false, typename Arg>
void launch(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (TunableKernel1D_base<grid_stride>::location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor, Arg>(tp, stream, arg);
} else if constexpr (enable_host) {
@@ -441,6 +459,16 @@ namespace quda
template <template <typename> class Functor, typename Arg>
void launch_device(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("KernelOps: %s\n", typeid(getKernelOps<Functor<Arg>>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
const_cast<Arg &>(arg).threads.y = vector_length_y;
const_cast<Arg &>(arg).threads.z = vector_length_z;
TunableKernel::launch_device<Functor, grid_stride>(KERNEL(Kernel3D), tp, stream, arg);
@@ -455,8 +483,18 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename Arg>
-void launch_host(const TuneParam &, const qudaStream_t &, const Arg &arg)
+void launch_host(const TuneParam &tp, const qudaStream_t &, const Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("KernelOps: %s\n", typeid(getKernelOps<Functor<Arg>>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
const_cast<Arg &>(arg).threads.y = vector_length_y;
const_cast<Arg &>(arg).threads.z = vector_length_z;
Kernel3D_host<Functor, Arg>(arg);
@@ -474,16 +512,6 @@
template <template <typename> class Functor, bool enable_host = false, typename Arg>
void launch(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("KernelOps: %s\n", typeid(getKernelOps<Functor<Arg>>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (TunableKernel2D_base<grid_stride>::location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor, Arg>(tp, stream, arg);
} else if constexpr (enable_host) {
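The same relocation is applied uniformly to the 1D, 2D and 3D kernel classes above; in the 3D class the diagnostic additionally prints the getKernelOps type, which helps attribute a mismatch when a functor composes several kernel ops.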
58 changes: 38 additions & 20 deletions include/tunable_reduction.h
@@ -71,6 +71,15 @@ namespace quda
template <template <typename> class Functor, typename T, typename Arg>
void launch_device(T &result, const TuneParam &tp, const qudaStream_t &stream, Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (tp.block.x * tp.block.y < static_cast<unsigned>(device::warp_size()))
errorQuda("Reduction kernels must use at least a warp of threads per block (%u %u < %u)", tp.block.x,
tp.block.y, device::warp_size());
@@ -96,8 +105,17 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename T, typename Arg>
-void launch_host(T &result, const TuneParam &, const qudaStream_t &, Arg &arg)
+void launch_host(T &result, const TuneParam &tp, const qudaStream_t &, Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (arg.threads.y != block_size_y)
errorQuda("Unexpected y threads: received %d, expected %d", arg.threads.y, block_size_y);
std::vector<T> result_(1);
@@ -122,15 +140,6 @@
template <template <typename> class Functor, bool enable_host = false, typename T, typename Arg>
void launch(T &result, const TuneParam &tp, const qudaStream_t &stream, Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor>(result, tp, stream, arg);
} else if constexpr (enable_host) {
@@ -231,6 +240,15 @@ namespace quda
template <template <typename> class Functor, typename T, typename Arg>
void launch_device(std::vector<T> &result, const TuneParam &tp, const qudaStream_t &stream, Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (tp.block.x * tp.block.y < static_cast<unsigned>(device::warp_size()))
errorQuda("Reduction kernels must use at least a warp of threads per block (%u %u < %u)", tp.block.x,
tp.block.y, device::warp_size());
@@ -254,8 +272,17 @@
@param[in] arg Kernel argument struct
*/
template <template <typename> class Functor, typename T, typename Arg>
-void launch_host(std::vector<T> &result, const TuneParam &, const qudaStream_t &, Arg &arg)
+void launch_host(std::vector<T> &result, const TuneParam &tp, const qudaStream_t &, Arg &arg)
{
+#ifdef CHECK_SHARED_BYTES
+auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
+auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
+if (sizeOps != sizeTp) {
+printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
+printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
+errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
+}
+#endif
if (n_batch_block_max > Arg::max_n_batch_block)
errorQuda("n_batch_block_max = %u greater than maximum supported %u", n_batch_block_max, Arg::max_n_batch_block);

@@ -280,15 +307,6 @@
template <template <typename> class Functor, bool enable_host = false, typename T, typename Arg>
void launch(T &result, const TuneParam &tp, const qudaStream_t &stream, Arg &arg)
{
-#ifdef CHECK_SHARED_BYTES
-auto sizeOps = sharedMemSize<getKernelOps<Functor<Arg>>>(tp.block);
-auto sizeTp = std::max(this->sharedBytesPerThread() * tp.block.x * tp.block.y * tp.block.z, this->sharedBytesPerBlock(tp));
-if (sizeOps != sizeTp) {
-printfQuda("Functor: %s\n", typeid(Functor<Arg>).name());
-printfQuda("block: %i %i %i\n", tp.block.x, tp.block.y, tp.block.z);
-errorQuda("Shared bytes mismatch kernel: %u tp: %u\n", sizeOps, sizeTp);
-}
-#endif
if (location == QUDA_CUDA_FIELD_LOCATION) {
launch_device<Functor>(result, tp, stream, arg);
} else if constexpr (enable_host) {
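In the reduction classes the relocated check runs ahead of the existing guard that reduction kernels use at least a warp of threads per block, so a shared-bytes mismatch is reported even for block shapes the guard would reject anyway.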
9 changes: 9 additions & 0 deletions lib/block_orthogonalize.in.cu
@@ -156,6 +156,15 @@ namespace quda {
}
}

+#if defined(QUDA_TARGET_SYCL)
+unsigned int sharedBytesPerBlock(const TuneParam &tp) const {
+using sum_t = double;
+int mVec = quda::tile_size<nColor, nVec>(tp.block.x);
+int vsize = 2 * sizeof(sum_t) * mVec;
+return vsize * (tp.block.x * tp.block.y * tp.block.z) / device::warp_size();
+}
+#endif

#ifdef SWIZZLE
bool advanceAux(TuneParam &param) const
{
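A worked example of the new SYCL-only reservation, with assumed numbers since mVec depends on quda::tile_size<nColor, nVec>: taking warp_size() = 32, a 32x8x1 block, and mVec = 4, each warp gets room for mVec complex double partial sums, so vsize = 2 * sizeof(double) * 4 = 64 bytes and the method returns 64 * (32*8*1) / 32 = 512 bytes, i.e. 64 bytes for each of the block's 8 warps.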
5 changes: 5 additions & 0 deletions lib/coarse_op.cuh
@@ -245,8 +245,13 @@ namespace quda {

unsigned int sharedBytesPerBlock(const TuneParam &param) const override
{
+#if defined(QUDA_TARGET_SYCL)
+if (type == COMPUTE_VUV || type == COMPUTE_VLV)
+return 4*sizeof(storeType)*arg.max_color_height_per_block*arg.max_color_width_per_block*4*coarseSpin*coarseSpin;
+#else
if (arg.shared_atomic && (type == COMPUTE_VUV || type == COMPUTE_VLV))
return 4*sizeof(storeType)*arg.max_color_height_per_block*arg.max_color_width_per_block*4*coarseSpin*coarseSpin;
+#endif
return TunableKernel3D::sharedBytesPerBlock(param);
}

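On the SYCL target the VUV/VLV shared-memory reservation is now made unconditionally, rather than only when arg.shared_atomic is set; presumably the SYCL kernel's declared ops request this storage regardless of the shared_atomic tuning state, so the unconditional reservation keeps sharedBytesPerBlock() consistent with the sharedMemSize-based check introduced above.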
