got chaithya CPU spreadinterp working, added example; couple minor GP…

…U type changes to keep
flatironinstitute · Jan 8, 2025 · 2e490a1 · 2e490a1
1 parent 85e1c4e
commit 2e490a1
Show file tree

Hide file tree

Showing 5 changed files with 123 additions and 29 deletions.
diff --git a/examples/spreadinterponly1d.cpp b/examples/spreadinterponly1d.cpp
@@ -0,0 +1,83 @@
+// this is all you must include for the finufft lib...
+#include <finufft.h>
+
+// also used in this example...
+#include <cassert>
+#include <chrono>
+#include <complex>
+#include <cstdio>
+#include <stdlib.h>
+#include <vector>
+using namespace std;
+using namespace std::chrono;
+
+int main(int argc, char *argv[])
+/* Example of double-prec spread/interp only tasks, with basic math test (total mass,
+   ie testing the zero-frequency component). A better math test would be,
+   ironically, to use this spreader/interpolator as part of a NUFFT :)
+   Complex I/O arrays, but kernel is real.  Barnett 1/8/25
+   See: spreadtestnd for demos of non-FINUFFT interface to spread/interp module.
+
+   Compile and run (static library case):
+
+   g++ spreadinterponly1d.cpp -I../include ../lib-static/libfinufft.a -o
+   spreadinterponly1d -lfftw3 -lfftw3_omp && ./spreadinterponly1d
+*/
+{
+  int M = 1e7; // number of nonuniform points
+  int N = 1e7; // size of regular grid
+  finufft_opts opts;
+  finufft_default_opts(&opts);
+  opts.spreadinterponly = 1;    // task: the following two control kernel used...
+  double tol            = 1e-9; // tolerance for (real) kernel shape design only
+  opts.upsampfac        = 2.0;  // pretend upsampling factor (really no upsampling)
+  // opts.spread_kerevalmeth = 0;  // would be needed for nonstd upsampfacs
+
+  complex<double> I = complex<double>(0.0, 1.0); // the imaginary unit
+  vector<double> x(M);                           // input
+  vector<complex<double>> c(M);                  // input
+  vector<complex<double>> F(N);                  // output (spread to this array)
+
+  // first spread M=1 single unit-strength at the origin, to get its total mass...
+  x[0]       = 0.0;
+  c[0]       = 1.0;
+  int unused = 1;
+  int ier    = finufft1d1(1, &x[0], &c[0], unused, tol, N, &F[0], &opts);
+  if (ier > 1) return ier;
+  complex<double> kersum = 0.0;
+  for (auto Fk : F) kersum += Fk; // kernel mass
+
+  // Now generate random nonuniform points (x) and complex strengths (c)...
+  for (int j = 0; j < M; ++j) {
+    x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1);
+  }
+
+  opts.debug = 1;
+  auto t0    = steady_clock::now(); // now spread with all M pts... (dir=1)
+  ier        = finufft1d1(M, &x[0], &c[0], unused, tol, N, &F[0], &opts);
+  double t   = (steady_clock::now() - t0) / 1.0s;
+  if (ier > 1) return ier;
+  complex<double> csum = 0.0; // tot input strength
+  for (auto cj : c) csum += cj;
+  complex<double> mass = 0.0; // tot output mass
+  for (auto Fk : F) mass += Fk;
+  double relerr = abs(mass - kersum * csum) / abs(mass);
+  printf("1D spread-only, double-prec, %.3g s (%.3g NU pt/sec), ier=%d, mass err %.3g\n",
+         t, M / t, ier, relerr);
+
+  for (auto &Fk : F) Fk = complex<double>{1.0, 0.0}; // unit grid input
+  opts.debug = 0;
+  t0         = steady_clock::now(); // now interp to all M pts...  (dir=2)
+  ier        = finufft1d2(M, &x[0], &c[0], unused, tol, N, &F[0], &opts);
+  t          = (steady_clock::now() - t0) / 1.0s;
+  if (ier > 1) return ier;
+  csum = 0.0; // tot output
+  for (auto cj : c) csum += cj;
+  double maxerr = 0.0;
+  for (auto cj : c) maxerr = max(maxerr, abs(cj - kersum));
+  printf("1D interp-only, double-prec, %.3g s (%.3g NU pt/sec), ier=%d, max err %.3g\n",
+         t, M / t, ier, maxerr / abs(kersum));
+  return 0;
+}
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
@@ -157,7 +157,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   }
   if (d_plan->opts.gpu_spreadinterponly) {
     // XNOR implementation below with boolean logic.
-    if ((d_plan->opts.upsampfac != 1) == (type != 3)) {
+    if ((d_plan->opts.upsampfac != 1.0) == (type != 3)) {
       ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
       goto finalize;
     }
@@ -197,7 +197,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required);
   }
 
-
   // dynamically request the maximum amount of shared memory available
   // for the spreader
 
@@ -765,8 +764,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
           thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N,
           d_plan->deconv, d_plan->deconv,
           [c1, c2, c3, d1, d2, d3, realsign] __host__ __device__(
-              const thrust::tuple<T, T, T> tuple, cuda_complex<T> deconv)
-              -> cuda_complex<T> {
+              const thrust::tuple<T, T, T> tuple,
+              cuda_complex<T> deconv) -> cuda_complex<T> {
             // d2 and d3 are 0 if dim < 2 and dim < 3
             const auto phase = c1 * (thrust::get<0>(tuple) + d1) +
                                c2 * (thrust::get<1>(tuple) + d2) +

diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu
@@ -99,8 +99,6 @@ void cufinufft_default_opts(cufinufft_opts *opts)
     The resulting struct may then be passed (instead of NULL) to the last
     argument of cufinufft_plan().
 
-    Options with prefix "gpu_" are used for gpu code.
-
     Notes:
     1) Values set in this function for different type and dimensions are preferable
     based on experiments. User can experiment with different settings by
@@ -112,17 +110,15 @@ void cufinufft_default_opts(cufinufft_opts *opts)
 {
   // sphinx tag (don't remove): @gpu_defopts_start
   // data handling opts...
-  opts->modeord       = 0;
-  opts->gpu_device_id = 0;
-
-  // diagnostic opts...
+  opts->modeord              = 0;
+  opts->gpu_device_id        = 0;
   opts->gpu_spreadinterponly = 0;
 
   // algorithm performance opts...
   opts->gpu_method         = 0;
   opts->gpu_sort           = 1;
   opts->gpu_kerevalmeth    = 1;
-  opts->upsampfac          = 0;
+  opts->upsampfac          = 0.0;
   opts->gpu_maxsubprobsize = 1024;
   opts->gpu_obinsizex      = 0;
   opts->gpu_obinsizey      = 0;

diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp
@@ -652,27 +652,42 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
                          // Note: batchSize not used since might be only 1.
 
     spopts.spread_direction = type;
+    constexpr TF EPSILON    = std::numeric_limits<TF>::epsilon();
 
-    constexpr TF EPSILON = std::numeric_limits<TF>::epsilon();
-    if (opts.showwarn) { // user warn round-off error (due to prob condition #)...
-      for (int idim = 0; idim < dim; ++idim)
-        if (EPSILON * mstu[idim] > 1.0)
-          fprintf(stderr,
-                  "%s warning: rounding err (due to cond # of prob) eps_mach*N%d = %.3g "
-                  "> 1 !\n",
-                  __func__, idim, (double)(EPSILON * mstu[idim]));
-    }
+    if (opts.spreadinterponly) { // (unusual case of no NUFFT, just report)
 
-    // determine fine grid sizes, sanity check them...
-    for (int idim = 0; idim < dim; ++idim) {
-      int nfier = set_nf_type12(mstu[idim], opts, spopts, &nfdim[idim]);
-      if (nfier) throw nfier; // nf too big; we're done
-    }
+      // spreadinterp grid will simply be the user's "mode" grid...
+      for (int idim = 0; idim < dim; ++idim) nfdim[idim] = mstu[idim];
 
-    if (!opts.spreadinterponly) { // ..... eval Fourier series, alloc workspace .....
+      if (opts.debug) { // "long long" here is to avoid warnings with printf...
+        printf("[%s] %dd spreadinterponly(dir=%d): (ms,mt,mu)=(%lld,%lld,%lld)"
+               "\n               ntrans=%d nthr=%d batchSize=%d kernel width ns=%d",
+               __func__, dim, type, (long long)mstu[0], (long long)mstu[1],
+               (long long)mstu[2], ntrans, nthr, batchSize, spopts.nspread);
+        if (batchSize == 1) // spread_thread has no effect in this case
+          printf("\n");
+        else
+          printf(" spread_thread=%d\n", opts.spread_thread);
+      }
 
-      for (int idim = 0; idim < dim; ++idim)
+    } else { // ..... usual NUFFT: eval Fourier series, alloc workspace .....
+
+      if (opts.showwarn) { // user warn round-off error (due to prob condition #)...
+        for (int idim = 0; idim < dim; ++idim)
+          if (EPSILON * mstu[idim] > 1.0)
+            fprintf(
+                stderr,
+                "%s warning: rounding err (due to cond # of prob) eps_mach*N%d = %.3g "
+                "> 1 !\n",
+                __func__, idim, (double)(EPSILON * mstu[idim]));
+      }
+
+      // determine fine grid sizes, sanity check, then alloc...
+      for (int idim = 0; idim < dim; ++idim) {
+        int nfier = set_nf_type12(mstu[idim], opts, spopts, &nfdim[idim]);
+        if (nfier) throw nfier;                   // nf too big; we're done
         phiHat[idim].resize(nfdim[idim] / 2 + 1); // alloc fseries
+      }
 
       if (opts.debug) { // "long long" here is to avoid warnings with printf...
         printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) "
@@ -1018,7 +1033,8 @@ int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
         spreadinterpSortedBatch<TF>(thisBatchSize, this, fwBatch_or_fkb, cjb);
         t_sprint += timer.elapsedsec();
       }
-      // Release the fwBatch vector to prevent double freeing of memory (explain!)
+      // Release the fwBatch vector to prevent double freeing of memory (explain or
+      // delete!)
     } // ........end b loop
 
     if (opts.debug) { // report total times in their natural order...

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
@@ -2161,7 +2161,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps
       return FINUFFT_ERR_HORNER_WRONG_BETA;
     }
     if (upsampfac < 1.0) { // no digits would result
-      fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n",
+      fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is < 1.0\n",
               upsampfac);
       return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
     }