Skip to content

Commit

Permalink
Removing alloca on GPU (#618)
Browse files Browse the repository at this point in the history
When building cuFINUFFT natively on your machine, you observed a performance regression but lower memory utilization. This was caused by the use of dynamic stack allocation to allocate memory for the kernel tensors. Previously, the approach was to allocate MAX_ARRAY on the stack. However, since the GPU stack is relatively small, this would spill over into global memory, leading to higher memory utilization.

In my benchmarks, I did not observe a performance regression when using alloca (dynamic stack allocation), which differs from your experience.

To achieve the best of both worlds, this PR now avoids alloca and instead uses a template recursion trick to generate different CUDA kernels for the various spreading width values. This approach eliminates the need for alloca while ensuring that no more memory is used than necessary.
  • Loading branch information
DiamonDinoia authored Feb 11, 2025
1 parent e217279 commit 73412b4
Show file tree
Hide file tree
Showing 16 changed files with 825 additions and 872 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

Master, using release name V 2.4.0 (1/7/25)
* PR618: removing alloca and using kernel dispatch on the GPU.
* PR617: Caching pip dependencies in github actions.
Forcing Ninja when building python on Windows.
* PR614: Added support for sccache in github actions.
Expand Down
8 changes: 4 additions & 4 deletions devel/gen_all_horner_C_code.m
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
ws = 2:16;
opts.wpad = false; % pad kernel eval to multiple of 4

if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w');
end
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
Expand All @@ -27,9 +27,9 @@
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
if j==1 % write switch statement
fwrite(fid,sprintf(' if (w==%d) {\n',w));
fwrite(fid,sprintf(' if constexpr (w==%d) {\n',w));
else
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
fwrite(fid,sprintf(' } else if constexpr (w==%d) {\n',w));
end
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
end
Expand Down
370 changes: 185 additions & 185 deletions include/cufinufft/contrib/ker_horner_allw_loop.inc

Large diffs are not rendered by default.

302 changes: 151 additions & 151 deletions include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions include/cufinufft/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
// common
#define MAX_NSPREAD 16
#define MIN_NSPREAD 2

// max number of positive quadr nodes
#define MAX_NQUAD 100
Expand Down
41 changes: 20 additions & 21 deletions include/cufinufft/spreadinterp.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
template<typename T>
int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth);

template<typename T>
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns)
template<typename T, int ns>
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta)
/* ES ("exp sqrt") kernel evaluation at single real argument:
phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2
related to an asymptotic approximation to the Kaiser--Bessel, itself an
Expand All @@ -88,9 +88,8 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int
: 0.0;
}

template<typename T>
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
const double upsampfac)
template<typename T, int w>
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const double upsampfac)
/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
This is the current evaluation method, since it's faster (except i7 w=16).
Expand All @@ -109,11 +108,11 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
}
}

template<typename T>
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w,
const T es_c, const T es_beta) {
template<typename T, int w>
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const T es_c,
                                                  const T es_beta) {
/* Fill ker[0..w-1] with the ES kernel evaluated at abs(x + i), for i = 0,..,w-1.
   The kernel width w is now a compile-time template parameter, so the templated
   evaluate_kernel<T, w> is used; the former runtime-width 4-argument call was a
   leftover duplicate of the pre-template signature and has been removed. */
  for (int i = 0; i < w; i++) {
    ker[i] = evaluate_kernel<T, w>(abs(x + i), es_c, es_beta);
  }
}

Expand All @@ -129,53 +128,53 @@ template<typename T> int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize);
// Wrappers for methods of spreading
template<typename T>
int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);

template<typename T>
int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);

// Wrappers for methods of interpolation
template<typename T>
template<typename T, int ns>
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);

Expand Down
37 changes: 25 additions & 12 deletions include/cufinufft/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <type_traits>
#include <utility> // for std::forward

#include <thrust/extrema.h>

#include <finufft_errors.h>

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
Expand Down Expand Up @@ -47,18 +50,6 @@ template<typename T> __forceinline__ __device__ auto interval(const int ns, cons
return int2{xstart, xend};
}

// Define a macro to check if NVCC version is >= 11.3
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__)
#if (__CUDACC_VER_MAJOR__ > 11) || \
(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600)
#define ALLOCA_SUPPORTED 1
// windows compatibility
#if __has_include(<malloc.h>)
#include <malloc.h>
#endif
#endif
#endif

#if defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ >= 900
#define COMPUTE_CAPABILITY_90_OR_HIGHER 1
Expand Down Expand Up @@ -191,6 +182,28 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts,
return std::make_tuple(nf, h, gam);
}

// Recursive compile-time dispatcher: walks the candidate kernel widths
// ns, ns+1, ..., MAX_NSPREAD and invokes func.operator()<ns>(args...) on the
// one that matches the runtime value target_ns. Returns
// FINUFFT_ERR_METHOD_NOTVALID when target_ns exceeds the supported range.
template<typename Func, typename T, int ns, typename... Args>
int dispatch_ns(Func &&func, int target_ns, Args &&...args) {
  if constexpr (ns <= MAX_NSPREAD) {
    // Matching width found: instantiate and call the functor for this ns.
    if (ns == target_ns) {
      return std::forward<Func>(func).template operator()<ns>(
          std::forward<Args>(args)...);
    }
    // No match yet: recurse with the next candidate width.
    return dispatch_ns<Func, T, ns + 1>(std::forward<Func>(func), target_ns,
                                        std::forward<Args>(args)...);
  } else {
    return FINUFFT_ERR_METHOD_NOTVALID; // exhausted all widths without a match
  }
}

// Entry point for ns-based dispatch: starts the compile-time recursion at the
// smallest supported spreading width and forwards func and args unchanged.
template<typename Func, typename T, typename... Args>
int launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) {
  constexpr int first_width = MIN_NSPREAD;
  return dispatch_ns<Func, T, first_width>(std::forward<Func>(func), target_ns,
                                           std::forward<Args>(args)...);
}

} // namespace utils
} // namespace cufinufft

Expand Down
69 changes: 36 additions & 33 deletions src/cuda/1d/interp1d_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,41 +10,46 @@
namespace cufinufft {
namespace spreadinterp {

template<typename T>
int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
/*
A wrapper for different interpolation methods.
Methods available:
(1) Non-uniform points driven
(2) Subproblem
Melody Shih 11/21/21
*/
{
int nf1 = d_plan->nf1;
int M = d_plan->M;

int ier;
switch (d_plan->opts.gpu_method) {
case 1: {
ier = cuinterp1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
} break;
default:
std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl;
ier = FINUFFT_ERR_METHOD_NOTVALID;
// Functor to handle function selection (nuptsdriven vs subprob).
// Invoked by launch_dispatch_ns with the compile-time kernel width ns; picks
// the interpolation method from the plan options. Only method 1 (non-uniform
// points driven) is valid in 1D.
struct Interp1DDispatcher {
  template<int ns, typename T>
  int operator()(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) const {
    switch (d_plan->opts.gpu_method) {
    case 1:
      return cuinterp1d_nuptsdriven<T, ns>(nf1, M, d_plan, blksize);
    default:
      std::cerr << "[cuinterp1d] error: incorrect method, should be 1\n";
      return FINUFFT_ERR_METHOD_NOTVALID;
    }
  }
};

// Updated cuinterp1d using generic dispatch
template<typename T> int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize) {
/*
    A wrapper for different interpolation methods.
    Methods available:
    (1) Non-uniform points driven
    (2) Subproblem
    Melody Shih 11/21/21
    The function now dispatches on the compile-time kernel width ns (via
    launch_dispatch_ns) instead of using alloca, which was reported to be
    slower by the MRI community.
    Marco Barbone 01/30/25
*/
  return launch_dispatch_ns<Interp1DDispatcher, T>(Interp1DDispatcher(),
                                                   d_plan->spopts.nspread, d_plan->nf1,
                                                   d_plan->M, d_plan, blksize);
}

template<typename T>
template<typename T, int ns>
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
auto &stream = d_plan->stream;
dim3 threadsPerBlock;
dim3 blocks;

int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
T es_c = d_plan->spopts.ES_c;
T es_beta = d_plan->spopts.ES_beta;
T sigma = d_plan->opts.upsampfac;
Expand All @@ -61,16 +66,14 @@ int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blks

if (d_plan->opts.gpu_kerevalmeth) {
for (int t = 0; t < blksize; t++) {
interp_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
d_idxnupts);
interp_1d_nuptsdriven<T, 1, ns><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
RETURN_IF_CUDA_ERROR
}
} else {
for (int t = 0; t < blksize; t++) {
interp_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
d_idxnupts);
interp_1d_nuptsdriven<T, 0, ns><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
RETURN_IF_CUDA_ERROR
}
}
Expand Down
Loading

0 comments on commit 73412b4

Please sign in to comment.