GPU type 3 in Python interface (#588)

* py: add basic type 3 interface for CUDA * cuda: fix stray printf * py: add type 3 testing for GPU * py: fix error msg for type 3 plan
flatironinstitute · Jan 24, 2025 · 622296e · 622296e
1 parent a6bf689
commit 622296e
Show file tree

Hide file tree

Showing 5 changed files with 132 additions and 14 deletions.
diff --git a/python/cufinufft/cufinufft/_cufinufft.py b/python/cufinufft/cufinufft/_cufinufft.py
@@ -116,14 +116,14 @@ class NufftOpts(ctypes.Structure):
 
 _set_pts = lib.cufinufft_setpts
 _set_pts.argtypes = [
-    c_void_p, c_int64, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_double_p,
-    c_double_p, c_double_p]
+    c_void_p, c_int64, c_void_p, c_void_p, c_void_p, ctypes.c_int64, c_void_p,
+    c_void_p, c_void_p]
 _set_pts.restype = c_int
 
 _set_ptsf = lib.cufinufftf_setpts
 _set_ptsf.argtypes = [
-    c_void_p, c_int64, c_void_p, c_void_p, c_void_p, ctypes.c_int, c_float_p,
-    c_float_p, c_float_p]
+    c_void_p, c_int64, c_void_p, c_void_p, c_void_p, ctypes.c_int64, c_void_p,
+    c_void_p, c_void_p]
 _set_ptsf.restype = c_int
 
 _exec_plan = lib.cufinufft_execute

diff --git a/python/cufinufft/cufinufft/_plan.py b/python/cufinufft/cufinufft/_plan.py
@@ -112,14 +112,25 @@ def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None,
         else:
             raise TypeError("Expected complex64 or complex128.")
 
-        if isinstance(n_modes, numbers.Integral):
-            n_modes = (n_modes,)
-        elif isinstance(n_modes, collections.abc.Iterable):
-            n_modes = tuple(n_modes)
+        if nufft_type == 3:
+            if isinstance(n_modes, numbers.Integral):
+                dim = n_modes
+                n_modes = (1,) * dim     # Ignored, can be anything
+            else:
+                raise ValueError("For a type 3 plan, n_modes_or_dim must be a single number, the dimension")
         else:
-            raise ValueError(f"Invalid n_modes '{n_modes}'")
+            if isinstance(n_modes, numbers.Integral):
+                n_modes = (n_modes,)
+            elif isinstance(n_modes, collections.abc.Iterable):
+                n_modes = tuple(n_modes)
+            else:
+                raise ValueError(f"Invalid n_modes '{n_modes}'")
+            dim = len(n_modes)
+
+        if dim not in [1, 2, 3]:
+            raise ValueError("Only dimensions 1, 2, and 3 supported")
 
-        self.dim = len(n_modes)
+        self.dim = dim
         self.type = nufft_type
         self.isign = isign
         self.eps = float(eps)
@@ -218,6 +229,17 @@ def setpts(self, x, y=None, z=None, s=None, t=None, u=None):
 
         M = _compat.get_array_size(_x)
 
+        if self.type == 3:
+            _s = _ensure_array_type(s, "s", self.real_dtype)
+            _t = _ensure_array_type(t, "t", self.real_dtype)
+            _u = _ensure_array_type(u, "u", self.real_dtype)
+
+            _s, _t, _u = _ensure_valid_pts(_s, _t, _u, self.dim)
+
+            N = _compat.get_array_size(_s)
+        else:
+            N = 0
+
         # Because FINUFFT/cufinufft are internally column major,
         #   we will reorder the pts axes. Reordering references
         #   save us from having to actually transpose signal data
@@ -235,15 +257,31 @@ def setpts(self, x, y=None, z=None, s=None, t=None, u=None):
         if self.dim >= 2:
             fpts_axes.insert(0, _compat.get_array_ptr(_y))
             self._references.append(_y)
-
         if self.dim >= 3:
             fpts_axes.insert(0, _compat.get_array_ptr(_z))
             self._references.append(_z)
 
+        # Do the same for type 3
+        if self.type == 3:
+            fpts_axes_t3 = [_compat.get_array_ptr(_s), None, None]
+            self._references.append(_s)
+            if self.dim >= 2:
+                fpts_axes_t3.insert(0, _compat.get_array_ptr(_t))
+                self._references.append(_t)
+
+            if self.dim >= 3:
+                fpts_axes_t3.insert(0, _compat.get_array_ptr(_u))
+                self._references.append(_u)
+        else:
+            fpts_axes_t3 = [None] * 3
+
         # Then take three items off the stack as our reordered axis.
-        ier = self._setpts(self._plan, M, *fpts_axes[:3], 0, None, None, None)
+        ier = self._setpts(self._plan,
+                           M, *fpts_axes[:3],
+                           N, *fpts_axes_t3[:3])
 
         self.nj = M
+        self.nk = N
 
         if ier != 0:
             raise RuntimeError('Error setting non-uniform points.')
@@ -280,6 +318,9 @@ def execute(self, data, out=None):
         elif self.type == 2:
             req_data_shape = (self.n_trans, *self.n_modes)
             req_out_shape = (self.nj,)
+        elif self.type == 3:
+            req_data_shape = (self.n_trans, self.nj)
+            req_out_shape = (self.nk,)
 
         _data, data_shape = _ensure_array_shape(_data, "data", req_data_shape,
                                                 allow_reshape=True)
@@ -295,7 +336,7 @@ def execute(self, data, out=None):
         else:
             _out = _ensure_array_shape(_out, "out", req_out_shape)
 
-        if self.type == 1:
+        if self.type in [1, 3]:
             ier = self._exec_plan(self._plan, _compat.get_array_ptr(_data),
                     _compat.get_array_ptr(_out))
         elif self.type == 2:

diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py
@@ -107,6 +107,44 @@ def _execute(*args, **kwargs):
         utils.verify_type2(k, fk, c, tol)
 
 
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("dim", list(set(len(shape) for shape in SHAPES)))
+@pytest.mark.parametrize("n_source_pts", MS)
+@pytest.mark.parametrize("n_target_pts", MS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+def test_type3(to_gpu, to_cpu, dtype, dim, n_source_pts, n_target_pts, output_arg):
+    if dtype == np.float32 and dim >= 2 and min(n_source_pts, n_target_pts) > 4000:
+        pytest.xfail("Garbage result for larger numbers of pts in single precision type 3")
+        # Strangely, this does not reproduce if we isolate the single case. To
+        # trigger it, we must run many other tests preceding this test case.
+        # So it's related to some global state of the library.
+
+    complex_dtype = utils._complex_dtype(dtype)
+
+    source_pts, source_coefs, target_pts = utils.type3_problem(complex_dtype,
+            dim, n_source_pts, n_target_pts)
+
+    plan = Plan(3, dim, dtype=complex_dtype)
+
+    source_pts_gpu = to_gpu(source_pts)
+    target_pts_gpu = to_gpu(target_pts)
+
+    source_coefs_gpu = to_gpu(source_coefs)
+
+    plan.setpts(*source_pts_gpu, *((None,) * (3 - dim)), *target_pts_gpu)
+
+    if not output_arg:
+        target_coefs_gpu = plan.execute(source_coefs_gpu)
+    else:
+        target_coefs_gpu = _compat.array_empty_like(source_coefs_gpu,
+                n_target_pts, dtype=complex_dtype)
+        plan.execute(source_coefs_gpu, out=target_coefs_gpu)
+
+    target_coefs = to_cpu(target_coefs_gpu)
+
+    utils.verify_type3(source_pts, source_coefs, target_pts, target_coefs, 1e-6)
+
+
 def test_opts(to_gpu, to_cpu, shape=(8, 8, 8), M=32, tol=1e-3):
     dtype = np.float32
 

diff --git a/python/cufinufft/tests/utils.py b/python/cufinufft/tests/utils.py
@@ -65,6 +65,16 @@ def type2_problem(dtype, shape, M, n_trans=()):
     return k, fk
 
 
+def type3_problem(dtype, dim, n_source_pts, n_target_pts, n_trans=()):
+    real_dtype = _real_dtype(dtype)
+
+    source_pts = gen_nu_pts(n_source_pts, dim=dim).astype(real_dtype)
+    source_coefs = gen_nonuniform_data(n_source_pts, n_trans=n_trans).astype(dtype)
+    target_pts = gen_nu_pts(n_target_pts, dim=dim).astype(real_dtype)
+
+    return source_pts, source_coefs, target_pts
+
+
 def make_grid(shape):
     dim = len(shape)
     shape = shape
@@ -97,6 +107,17 @@ def direct_type2(fk, k, dim):
     return c
 
 
+def direct_type3(source_pts, source_coefs, target_pts, ind):
+    target_pt = target_pts[:, ind[-1]]
+    target_pt = target_pt[:, np.newaxis]
+
+    _source_coef = source_coefs[ind[:-1]]
+
+    target_coef = np.sum(np.exp(1j * np.sum(target_pt * source_pts, axis=0)) * _source_coef)
+
+    return target_coef
+
+
 def verify_type1(k, c, fk, tol):
     dim = fk.ndim - (c.ndim - 1)
 
@@ -128,6 +149,25 @@ def verify_type2(k, fk, c, tol):
     assert type2_rel_err < 25 * tol
 
 
+def verify_type3(source_pts, source_coef, target_pts, target_coef, tol):
+    dim = source_pts.shape[0]
+
+    n_source_pts = source_pts.shape[-1]
+    n_target_pts = target_pts.shape[-1]
+    n_tr = source_coef.shape[:-1]
+
+    assert target_coef.shape == n_tr + (n_target_pts,)
+
+    ind = (int(0.1789 * n_target_pts),)
+
+    target_est = target_coef[ind]
+    target_true = direct_type3(source_pts, source_coef, target_pts, ind)
+
+    type3_rel_err = np.linalg.norm(target_est - target_true) / np.linalg.norm(target_true)
+
+    assert type3_rel_err < 100 * tol
+
+
 def transfer_funcs(module_name):
     if module_name == "pycuda":
         import pycuda.autoinit # NOQA:401

diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu
@@ -144,7 +144,6 @@ int cufinufft3d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
   cuda_complex<T> *d_cstart;
   cuda_complex<T> *d_fkstart;
   const auto stream = d_plan->stream;
-  printf("[cufinufft] d_plan->ntransf = %d\n", d_plan->ntransf);
   for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) {
     int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize);
     d_cstart    = d_c + i * d_plan->batchsize * d_plan->M;