[STF] reduce access mode #2830

Merged
merged 111 commits on Dec 10, 2024

Commits (111)
ebd96bd
Experiment to start introducing a reduction access mode used in kerne…
caugonnet Nov 15, 2024
4dbbc92
Add a trait to count the number of reductions required in a tuple of …
caugonnet Nov 15, 2024
7826839
WIP: create a new scalar<T> interface which can be used in a reductio…
caugonnet Nov 18, 2024
ff8f4e7
WIP ! Introduce owning_container_of trait class
caugonnet Nov 18, 2024
21535ea
WIP: save progress here, lots of hardcoded things and we need to move…
caugonnet Nov 20, 2024
c32d5b6
WIP : first prototype working...
caugonnet Nov 20, 2024
1446e4e
Proper initialization of shared memory buffers, and add another example
caugonnet Nov 20, 2024
ee65592
Some cleanups and renaming of classes for better clarity
caugonnet Nov 20, 2024
0cda63c
clang-format
caugonnet Nov 20, 2024
388e98e
workaround some false unused captured variable warning
caugonnet Nov 21, 2024
a5716f7
Fix various C++ errors, and do not use the I variable
caugonnet Nov 21, 2024
c198f07
Rework the CFD example to use reductions, and generalize the transfer…
caugonnet Nov 21, 2024
c22c3b7
clang-format
caugonnet Nov 21, 2024
eef3139
Implement transfer_host (name subject to change !) directly in the co…
caugonnet Nov 22, 2024
349b4c7
clang-format
caugonnet Nov 22, 2024
1c1ee34
Make it possible to either accumulate a reduction result with an exis…
caugonnet Nov 22, 2024
58c1b6f
Implement a set of predefined reducers
caugonnet Nov 22, 2024
f4bee86
clang-format
caugonnet Nov 22, 2024
c4191eb
move the definition of do_init and no_init
caugonnet Nov 22, 2024
ebd416d
update word count example
caugonnet Nov 22, 2024
7953e43
Code simplification to facilitate the transition to ::cuda::std::tuple
caugonnet Nov 22, 2024
2fcf0bb
Use ::cuda::std::tuple for reduction variables
caugonnet Nov 22, 2024
f87fd05
use proper type for the size of buffers
caugonnet Nov 22, 2024
c59e3c8
clang-format
caugonnet Nov 22, 2024
8c1be55
remove unused variables
caugonnet Nov 23, 2024
4f38bde
fix buffer size
caugonnet Nov 23, 2024
384510d
add missing typename
caugonnet Nov 23, 2024
ff95d9f
Add missing typename
caugonnet Nov 23, 2024
2946b3a
Add maybe_unused for variables currently unused in a WIP code
caugonnet Nov 23, 2024
4986fa5
clang-format
caugonnet Nov 23, 2024
41cfe8f
add a doxygen comment
caugonnet Nov 25, 2024
53c059c
Add missing constructors
caugonnet Nov 26, 2024
5f61b1a
Code cleanup
caugonnet Nov 26, 2024
23b8feb
remove dead code
caugonnet Nov 26, 2024
6b75b5e
task_dep_op_none should just be a tag type, there is no need to imple…
caugonnet Nov 26, 2024
0fb4110
Remove dead code
caugonnet Nov 26, 2024
10d78e6
Remove unused template parameter
caugonnet Nov 26, 2024
1795649
Slightly simpler count_type trait
caugonnet Nov 26, 2024
44b2bff
clang-format
caugonnet Nov 26, 2024
cf0976b
Add a small unit test to test count_type_v
caugonnet Nov 27, 2024
7e45e99
Do not define both no_init and do_init types anymore, just expose no_…
caugonnet Nov 27, 2024
6f11026
sort examples in cmake
caugonnet Nov 27, 2024
9ddd6e0
clang-format
caugonnet Nov 27, 2024
6323168
Simplify redux_vars
andralex Nov 27, 2024
3e789f6
Use ::std::monostate instead of EmptyType
andralex Nov 27, 2024
cd4f07d
Simplify redux_vars
andralex Nov 27, 2024
1b0c9ed
clang-format
caugonnet Dec 2, 2024
f6fac05
Add a missing doxygen comment
caugonnet Dec 2, 2024
4ca01bf
Replace 01-axpy-reduce.cu with 09-dot-reduce.cu which is a more meani…
caugonnet Dec 2, 2024
cd342a3
clang-format
caugonnet Dec 2, 2024
4d11ef7
fix word count example
caugonnet Dec 3, 2024
14798ae
Minimize copying of dependencies
andralex Dec 3, 2024
40e9b4b
- Fix how we load data in shared memory during the finalization kerne…
caugonnet Dec 4, 2024
bccfd2d
clang-format
caugonnet Dec 4, 2024
b549262
Example to compute pi using Monte Carlo method
caugonnet Dec 4, 2024
5e2a88b
Add a unit test to ensure the reduce access mode works
caugonnet Dec 4, 2024
f908c48
clang-format
caugonnet Dec 4, 2024
2c63730
Not all ascii chars between A and z are alphanumerical chars
caugonnet Dec 4, 2024
f6f9be9
remove dead code
caugonnet Dec 4, 2024
3fb8332
minor cleanups
caugonnet Dec 4, 2024
f826de9
Not all ascii chars between A and z are alphanumerical chars
caugonnet Dec 4, 2024
b029b36
no need for type alias when we use it once only
caugonnet Dec 4, 2024
49e8669
Fix pi test
andralex Dec 4, 2024
2d0c13d
Move reduction operator and init flag to task_dep, step 1
andralex Dec 4, 2024
7f3e4c2
Add a new test to check that the scalar interface works as expected (…
caugonnet Dec 5, 2024
563d3ef
Fully implement the scalar interface
caugonnet Dec 5, 2024
25424d6
fix potentially uninialized variable warnings
caugonnet Dec 5, 2024
1ede4a0
fix unused variable warning
caugonnet Dec 5, 2024
4362a3a
Add a test to ensure we properly deal with empty shapes in parallel_f…
caugonnet Dec 5, 2024
543b423
clang-format
caugonnet Dec 5, 2024
550220f
Implement the CUDA kernel for reduction with empty shapes
caugonnet Dec 5, 2024
d1cb05d
Move reduction operator and init flag to task_dep, step 2: parallel_f…
andralex Dec 5, 2024
3e51d0d
Move reduction operator and init flag to task_dep, step 3: make paral…
andralex Dec 5, 2024
ef86fe4
Fix the finalize kernel if there are more threads than items
caugonnet Dec 6, 2024
1c9cc01
clang-format
caugonnet Dec 6, 2024
872d81e
Implementation of the reduce access mode for CUDA graphs
caugonnet Dec 6, 2024
5223e0d
Test empty shapes with reductions on both stream and graphs
caugonnet Dec 6, 2024
234065b
Move reduction operator and init flag to task_dep, step 4: eliminate …
andralex Dec 6, 2024
1b170c4
clang-format
caugonnet Dec 6, 2024
120aac0
fix parallel_for on host
caugonnet Dec 6, 2024
96bd78d
Disable nvrtc workaround (#1116)
miscco Dec 6, 2024
3bddb50
Tighten overloading of context::parallel_for
andralex Dec 7, 2024
cd6d1e1
clang-format
andralex Dec 7, 2024
96f22f5
Optimize loop function by hoisting lambda definition out of the loop …
andralex Dec 7, 2024
7fb06e9
No need for SelectType
andralex Dec 7, 2024
5f40c75
A few more improvements
andralex Dec 7, 2024
a9489f9
Fix build
andralex Dec 7, 2024
d65f64d
Documentation for reduce()
caugonnet Dec 8, 2024
6cf4fea
Improve doc for reduce()
caugonnet Dec 8, 2024
91ea8a5
Rename transfer_host in wait
caugonnet Dec 8, 2024
cf50f84
doxygen blocks for reducer operators
caugonnet Dec 8, 2024
9f61096
Add missing doxygen blocks or make them more accurate
caugonnet Dec 8, 2024
14216b7
Remove commented code
caugonnet Dec 8, 2024
d0222b5
remove printf
caugonnet Dec 8, 2024
e53539c
Add sanity checks to detect unimplemented uses of reduce()
caugonnet Dec 8, 2024
4d2f92b
Fix a logic error
caugonnet Dec 8, 2024
e7844fe
remove maybe_unused that is not needed anymore
caugonnet Dec 8, 2024
c5660b2
Properly handle reduce on a CUDA graph that is not executed by device 0
caugonnet Dec 8, 2024
288cf77
Reimplement pagerank using a reduce access mode
caugonnet Dec 9, 2024
677f6ce
No need to atomicMaxFloat when using a reduce(reducer::maxval<float>{})
caugonnet Dec 9, 2024
5994828
use references in calculating_pagerank
caugonnet Dec 9, 2024
f106fb9
Add a missing doxygen block for scalar<T>
caugonnet Dec 9, 2024
4460e8d
Remove count_type_v and count_type which are not used anymore
caugonnet Dec 9, 2024
ca32d55
replace an atomic add by a reduction
caugonnet Dec 9, 2024
64c118c
Simpler scalar implementation with a struct
caugonnet Dec 10, 2024
de26df0
Comment to clarify get_owning_container_of
caugonnet Dec 10, 2024
44ebce3
Remove useless ctor
caugonnet Dec 10, 2024
9f27ca5
fix spelling issue
caugonnet Dec 10, 2024
c6e8fa1
clang-format
caugonnet Dec 10, 2024
a6024ca
Explain how we statically dispatch between the different task_dep(_un…
caugonnet Dec 10, 2024
7624ae4
Do provide constructors for scalar<T>
caugonnet Dec 10, 2024
55 changes: 55 additions & 0 deletions cudax/examples/stf/09-dot-reduce.cu
@@ -0,0 +1,55 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
*
* @brief Implementation of the DOT kernel using a reduce access mode
*
*/

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

int main()
{
const size_t N = 16;
double X[N], Y[N];

double ref_res = 0.0;

for (size_t i = 0; i < N; i++)
{
X[i] = cos(double(i));
Y[i] = sin(double(i));

// Compute the reference result of the DOT product of X and Y
ref_res += X[i] * Y[i];
}

context ctx;
auto lX = ctx.logical_data(X);
auto lY = ctx.logical_data(Y);

auto lsum = ctx.logical_data(shape_of<scalar<double>>());

/* Compute sum(x_i * y_i)*/
ctx.parallel_for(lY.shape(), lX.read(), lY.read(), lsum.reduce(reducer::sum<double>{}))
->*[] __device__(size_t i, auto dX, auto dY, double& sum) {
sum += dX(i) * dY(i);
};

double res = ctx.wait(lsum);

ctx.finalize();

_CCCL_ASSERT(fabs(res - ref_res) < 0.0001, "Invalid result");
}
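
The file above is the core pattern this PR introduces: declare a logical scalar from shape_of<scalar<double>>(), hand it to parallel_for through the new reduce() access mode, and read it back synchronously with ctx.wait(). The pattern composes with the other predefined reducers added by this PR; the following minimal sketch (not part of the diff) computes a maximum, assuming reducer::maxval instantiates for double as it does for float in pagerank.cu below:

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

int main()
{
  const size_t N = 16;
  double X[N];
  for (size_t i = 0; i < N; i++)
  {
    X[i] = sin(double(i));
  }

  context ctx;
  auto lX = ctx.logical_data(X);

  // A logical scalar defined only by its shape; the runtime owns the storage
  auto lmax = ctx.logical_data(shape_of<scalar<double>>());

  // Each device thread folds its element into the reduction through a plain
  // reference; no hand-written atomics or shared-memory trees are involved
  ctx.parallel_for(lX.shape(), lX.read(), lmax.reduce(reducer::maxval<double>{}))
      ->*[] __device__(size_t i, auto dX, double& m) {
        m = ::std::max(m, dX(i));
      };

  // wait() blocks until the reduced value is available and returns it
  double res = ctx.wait(lmax);

  ctx.finalize();

  _CCCL_ASSERT(res <= 1.0, "Invalid result");
}
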
11 changes: 7 additions & 4 deletions cudax/examples/stf/CMakeLists.txt
@@ -16,17 +16,18 @@ set(stf_example_sources

# Examples which rely on code generation (parallel_for or launch)
set(stf_example_codegen_sources
01-axpy-parallel_for.cu
01-axpy-launch.cu
01-axpy-parallel_for.cu
binary_fhe.cu
09-dot-reduce.cu
cfd.cu
custom_data_interface.cu
fdtd_mgpu.cu
frozen_data_init.cu
graph_algorithms/degree_centrality.cu
graph_algorithms/jaccard.cu
graph_algorithms/pagerank.cu
graph_algorithms/tricount.cu
graph_algorithms/jaccard.cu
fdtd_mgpu.cu
heat.cu
heat_mgpu.cu
jacobi.cu
@@ -35,11 +36,13 @@ set(stf_example_codegen_sources
launch_sum.cu
launch_sum_cub.cu
logical_gates_composition.cu
mandelbrot.cu
parallel_for_2D.cu
pi.cu
scan.cu
mandelbrot.cu
standalone-launches.cu
word_count.cu
word_count_reduce.cu
)

# Examples using CUBLAS, CUSOLVER...
162 changes: 20 additions & 142 deletions cudax/examples/stf/cfd.cu
@@ -188,86 +188,18 @@ void jacobistepvort(
};
}

template <typename T>
T transfer_host(context& ctx, logical_data<slice<T>>& ldata)
double deltasq(context& ctx, logical_data<slice<double, 2>> lnewarr, logical_data<slice<double, 2>> loldarr)
{
T out;
auto ldsq = ctx.logical_data(shape_of<scalar<double>>()).set_symbol("tmp_accumulator");

bool is_graph = ctx.is_graph_ctx();

if (is_graph)
{
ctx.host_launch(ldata.read()).set_symbol("transfer_host")->*[&](auto data) {
out = data(0);
};

/* This forces the completion of the host callback, so that the host
* thread can use the content for dynamic control flow */
cudaStreamSynchronize(ctx.task_fence());
}
else
{
ctx.task(exec_place::host, ldata.read()).set_symbol("transfer_host")->*[&](cudaStream_t stream, auto data) {
cuda_safe_call(cudaStreamSynchronize(stream));
out = data(0);
};
}

return out;
}

double
deltasq(context& ctx, logical_data<slice<double, 2>> lnewarr, logical_data<slice<double, 2>> loldarr, int m, int n)
{
auto ldsq = ctx.logical_data(shape_of<slice<double>>({1})).set_symbol("tmp_accumulator");

//
// for (i = 1; i <= m; i++) {
// for (j = 1; j <= n; j++) {
// double tmp = newarr[i * (m + 2) + j] - oldarr[i * (m + 2) + j];
// dsq += tmp * tmp;
// }
// }

auto spec = con(con<128>(hw_scope::thread));
ctx.launch(spec, ldsq.write(), lnewarr.read(), loldarr.read()).set_symbol("deltasq")->*
[m, n] __device__(auto th, auto dsq, auto newarr, auto oldarr) {
if (th.rank() == 0)
{
dsq(0) = 0.0;
}
th.sync();

// Each thread computes the sum of elements assigned to it
double local_sum = 0.0;
for (auto [i, j] :
th.apply_partition(box<2>({1, m + 1}, {1, n + 1}), std::tuple<blocked_partition, cyclic_partition>()))
{
double tmp = newarr(i, j) - oldarr(i, j);
local_sum += tmp * tmp;
}

auto ti = th.inner();

__shared__ double block_sum[th.static_width(1)];
block_sum[ti.rank()] = local_sum;

for (size_t s = ti.size() / 2; s > 0; s /= 2)
{
if (ti.rank() < s)
{
block_sum[ti.rank()] += block_sum[ti.rank() + s];
}
ti.sync();
}

if (ti.rank() == 0)
{
atomicAdd(&dsq(0), block_sum[0]);
}
};
ctx.parallel_for(lnewarr.shape(), ldsq.reduce(reducer::sum<double>{}), lnewarr.read(), loldarr.read())
.set_symbol("deltasq")
->*[] __device__(size_t i, size_t j, auto& dsq, auto newarr, auto oldarr) {
double tmp = newarr(i, j) - oldarr(i, j);
dsq += tmp * tmp;
};

return transfer_host(ctx, ldsq);
return ctx.wait(ldsq);
}

void boundarypsi(context& ctx, logical_data<slice<double, 2>> lpsi, int m, int /*n*/, int b, int h, int w)
@@ -422,44 +354,14 @@ int main(int argc, char** argv)
boundarypsi(ctx, lpsi, m, n, b, h, w);

// compute normalisation factor for error
auto lbnorm = ctx.logical_data(shape_of<slice<double>>({1})).set_symbol("bnorm");
auto lbnorm = ctx.logical_data(shape_of<scalar<double>>()).set_symbol("bnorm");

nvtxRangePush("Compute_Normalization");

// bnorm += psi * psi
auto spec = con(con<32>());
ctx.launch(spec, lbnorm.write(), lpsi.read()).set_symbol("Compute_Normalization")
->*[] __device__(auto th, auto bnorm, auto psi) {
if (th.rank() == 0)
{
bnorm(0) = 0.0;
}
th.sync();
// Each thread computes the sum of elements assigned to it
double local_sum = 0.0;
for (auto [i, j] : th.apply_partition(shape(psi)))
{
local_sum += psi(i, j) * psi(i, j);
}

auto ti = th.inner();

__shared__ double block_sum[th.static_width(1)];
block_sum[ti.rank()] = local_sum;

for (size_t s = ti.size() / 2; s > 0; s /= 2)
{
if (ti.rank() < s)
{
block_sum[ti.rank()] += block_sum[ti.rank() + s];
}
ti.sync();
}

if (ti.rank() == 0)
{
atomicAdd(&bnorm(0), block_sum[0]);
}
// bnorm = psi * psi
ctx.parallel_for(lpsi.shape(), lpsi.read(), lbnorm.reduce(reducer::sum<double>{}))
->*[] __device__(size_t i, size_t j, auto psi, auto& bnorm) {
bnorm += psi(i, j) * psi(i, j);
};

if (!irrotational)
@@ -468,37 +370,13 @@ int main(int argc, char** argv)
boundaryzet(ctx, lzet, lpsi, m, n);

// update normalisation
ctx.launch(spec, lbnorm.rw(), lzet.read()).set_symbol("Compute_Normalization")
->*[] __device__(auto th, auto bnorm, auto zet) {
// Each thread computes the sum of elements assigned to it
double local_sum = 0.0;
for (auto [i, j] : th.apply_partition(shape(zet)))
{
local_sum += zet(i, j) * zet(i, j);
}

auto ti = th.inner();

__shared__ double block_sum[th.static_width(1)];
block_sum[ti.rank()] = local_sum;

for (size_t s = ti.size() / 2; s > 0; s /= 2)
{
if (ti.rank() < s)
{
block_sum[ti.rank()] += block_sum[ti.rank() + s];
}
ti.sync();
}

if (ti.rank() == 0)
{
atomicAdd(&bnorm(0), block_sum[0]);
}
ctx.parallel_for(lzet.shape(), lzet.read(), lbnorm.reduce(reducer::sum<double>{}, no_init{}))
->*[] __device__(size_t i, size_t j, auto zet, auto& bnorm_zet) {
bnorm_zet += zet(i, j) * zet(i, j);
};
}

double bnorm = transfer_host(ctx, lbnorm);
double bnorm = ctx.wait(lbnorm);
bnorm = sqrt(bnorm);

// begin iterative Jacobi loop
@@ -525,11 +403,11 @@ int main(int argc, char** argv)
bool compute_error = (iter == numiter) || (checkerr && (iter % printfreq == 0));
if (compute_error)
{
error = deltasq(ctx, lpsitmp, lpsi, m, n);
error = deltasq(ctx, lpsitmp, lpsi);

if (!irrotational)
{
error += deltasq(ctx, lzettmp, lzet, m, n);
error += deltasq(ctx, lzettmp, lzet);
}

error = sqrt(error);
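
One detail in the cfd.cu changes deserves a callout: the second normalisation pass uses reduce(reducer::sum<double>{}, no_init{}), which accumulates the zet contribution on top of the value lbnorm already holds instead of restarting from the sum identity, mirroring the old rw() + atomicAdd pattern. A minimal standalone sketch of that two-pass idiom (arrays A and B are hypothetical, not from the PR):

#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

int main()
{
  const size_t N = 16;
  double A[N], B[N];
  for (size_t i = 0; i < N; i++)
  {
    A[i] = cos(double(i));
    B[i] = sin(double(i));
  }

  context ctx;
  auto lA   = ctx.logical_data(A);
  auto lB   = ctx.logical_data(B);
  auto lsum = ctx.logical_data(shape_of<scalar<double>>());

  // First pass: the reduction starts from the operator's identity (0 for sum)
  ctx.parallel_for(lA.shape(), lA.read(), lsum.reduce(reducer::sum<double>{}))
      ->*[] __device__(size_t i, auto dA, double& s) {
        s += dA(i) * dA(i);
      };

  // Second pass: no_init{} accumulates on top of the value produced above,
  // so the final scalar holds sum(A*A) + sum(B*B)
  ctx.parallel_for(lB.shape(), lB.read(), lsum.reduce(reducer::sum<double>{}, no_init{}))
      ->*[] __device__(size_t i, auto dB, double& s) {
        s += dB(i) * dB(i);
      };

  double total = ctx.wait(lsum);
  ctx.finalize();

  _CCCL_ASSERT(total > 0.0, "Invalid result");
}
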
57 changes: 16 additions & 41 deletions cudax/examples/stf/graph_algorithms/pagerank.cu
@@ -21,22 +21,6 @@

using namespace cuda::experimental::stf;

/**
* @brief Performs an atomic maximum operation on floating-point numbers by reinterpreting them as integers.
*
* @param address Pointer to the float value that will be updated.
* @param val The float value to compare and possibly set at the address.
* @return The old value at the address (reinterpreted as a float).
*/
__device__ float atomicMaxFloat(float* address, float val)
{
int* address_as_int = (int*) address;
int old = *address_as_int;
int new_val = __float_as_int(val);
atomicMax(address_as_int, new_val);
return __int_as_float(old);
}

/**
* @brief Calculates the PageRank for a given vertex.
*
@@ -49,10 +33,10 @@ __device__ float atomicMaxFloat(float* address, float val)
*/
__device__ void calculating_pagerank(
int idx,
slice<const int> loffsets,
slice<const int> lnonzeros,
slice<const float> lpage_rank,
slice<float> lnew_page_rank,
const slice<const int>& loffsets,
const slice<const int>& lnonzeros,
const slice<const float>& lpage_rank,
slice<float>& lnew_page_rank,
float init_rank)
{
float rank_sum = 0.0;
@@ -77,7 +61,6 @@ int main()
int num_vertices = offsets.size() - 1;
float init_rank = 1.0f / num_vertices;
float tolerance = 1e-6f;
float max_diff = 0.0f;
int NITER = 100;

// output pageranks for each vertex
@@ -88,34 +71,26 @@
auto lnonzeros = ctx.logical_data(&nonzeros[0], nonzeros.size());
auto lpage_rank = ctx.logical_data(&page_rank[0], page_rank.size());
auto lnew_page_rank = ctx.logical_data(&new_page_rank[0], new_page_rank.size());
auto lmax_diff = ctx.logical_data(&max_diff, {1});
auto lmax_diff = ctx.logical_data(shape_of<scalar<float>>());

for (int iter = 0; iter < NITER; ++iter)
{
// Calculate Current Iteration PageRank
ctx.parallel_for(box(num_vertices), loffsets.read(), lnonzeros.read(), lpage_rank.rw(), lnew_page_rank.rw())
->*[init_rank] __device__(size_t idx, auto loffsets, auto lnonzeros, auto lpage_rank, auto lnew_page_rank) {
ctx.parallel_for(
box(num_vertices),
loffsets.read(),
lnonzeros.read(),
lpage_rank.rw(),
lnew_page_rank.rw(),
lmax_diff.reduce(reducer::maxval<float>{}))
->*[init_rank] __device__(
size_t idx, auto loffsets, auto lnonzeros, auto lpage_rank, auto lnew_page_rank, auto& max_diff) {
calculating_pagerank(idx, loffsets, lnonzeros, lpage_rank, lnew_page_rank, init_rank);
};

// Calculate Current Iteration Error
ctx.parallel_for(box(1), lmax_diff.write())->*[] __device__(size_t, auto lmax_diff) {
lmax_diff(0) = 0.0f;
};

// Calculate Current Iteration Error
ctx.parallel_for(box(num_vertices), lpage_rank.read(), lnew_page_rank.read(), lmax_diff.rw())
->*[] __device__(size_t idx, auto lpage_rank, auto lnew_page_rank, auto lmax_diff) {
atomicMaxFloat(lmax_diff.data_handle(), fabs(lnew_page_rank[idx] - lpage_rank[idx]));
max_diff = ::std::max(max_diff, lnew_page_rank[idx] - lpage_rank[idx]);
};

// Reduce Error and Check for Convergence
bool converged;
ctx.task(exec_place::host, lmax_diff.read())->*[tolerance, &converged](cudaStream_t s, auto max_diff) {
cuda_safe_call(cudaStreamSynchronize(s));
converged = (max_diff(0) < tolerance);
};

bool converged = (ctx.wait(lmax_diff) < tolerance);
if (converged)
{
break;
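
The pagerank.cu rewrite shows the other payoff of the reduce access mode: the hand-rolled atomicMaxFloat helper is deleted, and the convergence metric comes out of the same parallel_for that updates the ranks. A minimal sketch of that fused update-plus-error shape (the 0.5f update rule and the 1e-6f tolerance are placeholders, not the PR's logic):

#include <cuda/experimental/stf.cuh>

#include <vector>

using namespace cuda::experimental::stf;

int main()
{
  const size_t N = 1024;
  ::std::vector<float> vals(N, 1.0f);

  context ctx;
  auto lvals = ctx.logical_data(&vals[0], vals.size());
  auto ldiff = ctx.logical_data(shape_of<scalar<float>>());

  // A single kernel updates the data and feeds the convergence metric:
  // each thread folds |updated - old| into the max-reduction, no atomics
  ctx.parallel_for(box(N), lvals.rw(), ldiff.reduce(reducer::maxval<float>{}))
      ->*[] __device__(size_t i, auto v, float& max_diff) {
        float updated = 0.5f * v(i);
        max_diff = ::std::max(max_diff, fabsf(updated - v(i)));
        v(i) = updated;
      };

  // The host decides whether to iterate again based on the reduced value
  bool converged = (ctx.wait(ldiff) < 1e-6f);
  ctx.finalize();

  return converged ? 0 : 1;
}
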