Skip to content

Commit

Permalink
Removing alloca on GPU (#618)
Browse files Browse the repository at this point in the history
When building cuFINUFFT natively on your machine, you observed a performance regression but lower memory utilization. This was caused by the use of dynamic stack allocation to allocate memory for the kernel tensors. Previously, the approach was to allocate MAX_ARRAY on the stack. However, since the GPU stack is relatively small, this would spill over into global memory, leading to higher memory utilization.

In my benchmarks, I did not observe a performance regression when using alloca (dynamic stack allocation), which differs from your experience.

To achieve the best of both worlds, this PR now avoids alloca and instead uses a template recursion trick to generate different CUDA kernels for the various spreading width values. This approach eliminates the need for alloca while ensuring that no more memory is used than necessary.
  • Loading branch information
DiamonDinoia authored Feb 11, 2025
1 parent e217279 commit 73412b4
Show file tree
Hide file tree
Showing 16 changed files with 825 additions and 872 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

Master, using release name V 2.4.0 (1/7/25)
* PR618: removing alloca and using kernel dispatch on the GPU.
* PR617: Caching pip dependencies in github actions.
Forcing Ninja when building python on Windows.
* PR614: Added support for sccache in github actions.
Expand Down
8 changes: 4 additions & 4 deletions devel/gen_all_horner_C_code.m
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
ws = 2:16;
opts.wpad = false; % pad kernel eval to multiple of 4

if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w');
end
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
Expand All @@ -27,9 +27,9 @@
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
if j==1 % write switch statement
fwrite(fid,sprintf(' if (w==%d) {\n',w));
fwrite(fid,sprintf(' if constexpr (w==%d) {\n',w));
else
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
fwrite(fid,sprintf(' } else if constexpr (w==%d) {\n',w));
end
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
end
Expand Down
370 changes: 185 additions & 185 deletions include/cufinufft/contrib/ker_horner_allw_loop.inc

Large diffs are not rendered by default.

302 changes: 151 additions & 151 deletions include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions include/cufinufft/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
// common
#define MAX_NSPREAD 16
#define MIN_NSPREAD 2

// max number of positive quadr nodes
#define MAX_NQUAD 100
Expand Down
41 changes: 20 additions & 21 deletions include/cufinufft/spreadinterp.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
template<typename T>
int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth);

template<typename T>
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns)
template<typename T, int ns>
static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta)
/* ES ("exp sqrt") kernel evaluation at single real argument:
phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2
related to an asymptotic approximation to the Kaiser--Bessel, itself an
Expand All @@ -88,9 +88,8 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int
: 0.0;
}

template<typename T>
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
const double upsampfac)
template<typename T, int w>
static __device__ void eval_kernel_vec_horner(T *ker, const T x, const double upsampfac)
/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
This is the current evaluation method, since it's faster (except i7 w=16).
Expand All @@ -109,11 +108,11 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
}
}

template<typename T>
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w,
const T es_c, const T es_beta) {
template<typename T, int w>
static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const T es_c,
                                                  const T es_beta) {
/* Fill ker[0..w-1] with the ES kernel evaluated at abs(x + i), for i = 0,..,w-1.
   The kernel width w is now a compile-time template parameter, so the templated
   evaluate_kernel<T, w> is used; the former runtime-width 4-argument call was a
   leftover duplicate of the pre-template signature and has been removed. */
  for (int i = 0; i < w; i++) {
    ker[i] = evaluate_kernel<T, w>(abs(x + i), es_c, es_beta);
  }
}

Expand All @@ -129,53 +128,53 @@ template<typename T> int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize);
// Wrappers for methods of spreading
template<typename T>
int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);

template<typename T>
int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
cufinufft_plan_t<T> *d_plan);
template<typename T>
template<typename T, int ns>
int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);

// Wrappers for methods of interpolation
template<typename T>
template<typename T, int ns>
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);
template<typename T>
template<typename T, int ns>
int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
int blksize);

Expand Down
37 changes: 25 additions & 12 deletions include/cufinufft/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <type_traits>
#include <utility> // for std::forward

#include <thrust/extrema.h>

#include <finufft_errors.h>

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
Expand Down Expand Up @@ -47,18 +50,6 @@ template<typename T> __forceinline__ __device__ auto interval(const int ns, cons
return int2{xstart, xend};
}

// Define a macro to check if NVCC version is >= 11.3
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__)
#if (__CUDACC_VER_MAJOR__ > 11) || \
(__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600)
#define ALLOCA_SUPPORTED 1
// windows compatibility
#if __has_include(<malloc.h>)
#include <malloc.h>
#endif
#endif
#endif

#if defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ >= 900
#define COMPUTE_CAPABILITY_90_OR_HIGHER 1
Expand Down Expand Up @@ -191,6 +182,28 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts,
return std::make_tuple(nf, h, gam);
}

// Recursive compile-time dispatcher: walks the candidate kernel widths
// ns, ns+1, ..., MAX_NSPREAD and invokes func.operator()<ns>(args...) on the
// one that matches the runtime value target_ns. Returns
// FINUFFT_ERR_METHOD_NOTVALID when target_ns exceeds the supported range.
template<typename Func, typename T, int ns, typename... Args>
int dispatch_ns(Func &&func, int target_ns, Args &&...args) {
  if constexpr (ns <= MAX_NSPREAD) {
    // Matching width found: instantiate and call the functor for this ns.
    if (ns == target_ns) {
      return std::forward<Func>(func).template operator()<ns>(
          std::forward<Args>(args)...);
    }
    // No match yet: recurse with the next candidate width.
    return dispatch_ns<Func, T, ns + 1>(std::forward<Func>(func), target_ns,
                                        std::forward<Args>(args)...);
  } else {
    return FINUFFT_ERR_METHOD_NOTVALID; // exhausted all widths without a match
  }
}

// Entry point for ns-based dispatch: starts the compile-time recursion at the
// smallest supported spreading width and forwards func and args unchanged.
template<typename Func, typename T, typename... Args>
int launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) {
  constexpr int first_width = MIN_NSPREAD;
  return dispatch_ns<Func, T, first_width>(std::forward<Func>(func), target_ns,
                                           std::forward<Args>(args)...);
}

} // namespace utils
} // namespace cufinufft

Expand Down
69 changes: 36 additions & 33 deletions src/cuda/1d/interp1d_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,41 +10,46 @@
namespace cufinufft {
namespace spreadinterp {

template<typename T>
int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
/*
A wrapper for different interpolation methods.
Methods available:
(1) Non-uniform points driven
(2) Subproblem
Melody Shih 11/21/21
*/
{
int nf1 = d_plan->nf1;
int M = d_plan->M;

int ier;
switch (d_plan->opts.gpu_method) {
case 1: {
ier = cuinterp1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
} break;
default:
std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl;
ier = FINUFFT_ERR_METHOD_NOTVALID;
// Functor to handle function selection (nuptsdriven vs subprob).
// Invoked by launch_dispatch_ns with the compile-time kernel width ns; picks
// the interpolation method from the plan options. Only method 1 (non-uniform
// points driven) is valid in 1D.
struct Interp1DDispatcher {
  template<int ns, typename T>
  int operator()(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) const {
    switch (d_plan->opts.gpu_method) {
    case 1:
      return cuinterp1d_nuptsdriven<T, ns>(nf1, M, d_plan, blksize);
    default:
      std::cerr << "[cuinterp1d] error: incorrect method, should be 1\n";
      return FINUFFT_ERR_METHOD_NOTVALID;
    }
  }
};

// Updated cuinterp1d using generic dispatch
template<typename T> int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize) {
/*
    A wrapper for different interpolation methods.
    Methods available:
    (1) Non-uniform points driven
    (2) Subproblem
    Melody Shih 11/21/21
    The function now dispatches on the compile-time kernel width ns (via
    launch_dispatch_ns) instead of using alloca, which was reported to be
    slower by the MRI community.
    Marco Barbone 01/30/25
*/
  return launch_dispatch_ns<Interp1DDispatcher, T>(Interp1DDispatcher(),
                                                   d_plan->spopts.nspread, d_plan->nf1,
                                                   d_plan->M, d_plan, blksize);
}

template<typename T>
template<typename T, int ns>
int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
auto &stream = d_plan->stream;
dim3 threadsPerBlock;
dim3 blocks;

int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
T es_c = d_plan->spopts.ES_c;
T es_beta = d_plan->spopts.ES_beta;
T sigma = d_plan->opts.upsampfac;
Expand All @@ -61,16 +66,14 @@ int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blks

if (d_plan->opts.gpu_kerevalmeth) {
for (int t = 0; t < blksize; t++) {
interp_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
d_idxnupts);
interp_1d_nuptsdriven<T, 1, ns><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
RETURN_IF_CUDA_ERROR
}
} else {
for (int t = 0; t < blksize; t++) {
interp_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
d_idxnupts);
interp_1d_nuptsdriven<T, 0, ns><<<blocks, threadsPerBlock, 0, stream>>>(
d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts);
RETURN_IF_CUDA_ERROR
}
}
Expand Down
Loading

0 comments on commit 73412b4

Please sign in to comment.