Skip to content

Commit

Permalink
got chaithya CPU spreadinterp working, added example; couple minor GP…
Browse files Browse the repository at this point in the history
…U type changes to keep
  • Loading branch information
ahbarnett committed Jan 8, 2025
1 parent 85e1c4e commit 2e490a1
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 29 deletions.
83 changes: 83 additions & 0 deletions examples/spreadinterponly1d.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// this is all you must include for the finufft lib...
#include <finufft.h>

// also used in this example...
#include <cassert>
#include <chrono>
#include <complex>
#include <cstdio>
#include <stdlib.h>
#include <vector>
using namespace std;
using namespace std::chrono;

int main(int argc, char *argv[])
/* Example of double-prec spread/interp only tasks, with basic math test (total mass,
ie testing the zero-frequency component). A better math test would be,
ironically, to use this spreader/interpolator as part of a NUFFT :)
Complex I/O arrays, but kernel is real. Barnett 1/8/25
See: spreadtestnd for demos of non-FINUFFT interface to spread/interp module.
Compile and run (static library case):
g++ spreadinterponly1d.cpp -I../include ../lib-static/libfinufft.a -o
spreadinterponly1d -lfftw3 -lfftw3_omp && ./spreadinterponly1d
*/
{
int M = 1e7; // number of nonuniform points
int N = 1e7; // size of regular grid
finufft_opts opts;
finufft_default_opts(&opts);
opts.spreadinterponly = 1; // task: the following two control kernel used...
double tol = 1e-9; // tolerance for (real) kernel shape design only
opts.upsampfac = 2.0; // pretend upsampling factor (really no upsampling)
// opts.spread_kerevalmeth = 0; // would be needed for nonstd upsampfacs

complex<double> I = complex<double>(0.0, 1.0); // the imaginary unit
vector<double> x(M); // input
vector<complex<double>> c(M); // input
vector<complex<double>> F(N); // output (spread to this array)

// first spread M=1 single unit-strength at the origin, to get its total mass...
x[0] = 0.0;
c[0] = 1.0;
int unused = 1;
int ier = finufft1d1(1, &x[0], &c[0], unused, tol, N, &F[0], &opts);
if (ier > 1) return ier;
complex<double> kersum = 0.0;
for (auto Fk : F) kersum += Fk; // kernel mass

// Now generate random nonuniform points (x) and complex strengths (c)...
for (int j = 0; j < M; ++j) {
x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
c[j] =
2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1);
}

opts.debug = 1;
auto t0 = steady_clock::now(); // now spread with all M pts... (dir=1)
ier = finufft1d1(M, &x[0], &c[0], unused, tol, N, &F[0], &opts);
double t = (steady_clock::now() - t0) / 1.0s;
if (ier > 1) return ier;
complex<double> csum = 0.0; // tot input strength
for (auto cj : c) csum += cj;
complex<double> mass = 0.0; // tot output mass
for (auto Fk : F) mass += Fk;
double relerr = abs(mass - kersum * csum) / abs(mass);
printf("1D spread-only, double-prec, %.3g s (%.3g NU pt/sec), ier=%d, mass err %.3g\n",
t, M / t, ier, relerr);

for (auto &Fk : F) Fk = complex<double>{1.0, 0.0}; // unit grid input
opts.debug = 0;
t0 = steady_clock::now(); // now interp to all M pts... (dir=2)
ier = finufft1d2(M, &x[0], &c[0], unused, tol, N, &F[0], &opts);
t = (steady_clock::now() - t0) / 1.0s;
if (ier > 1) return ier;
csum = 0.0; // tot output
for (auto cj : c) csum += cj;
double maxerr = 0.0;
for (auto cj : c) maxerr = max(maxerr, abs(cj - kersum));
printf("1D interp-only, double-prec, %.3g s (%.3g NU pt/sec), ier=%d, max err %.3g\n",
t, M / t, ier, maxerr / abs(kersum));
return 0;
}
7 changes: 3 additions & 4 deletions include/cufinufft/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
}
if (d_plan->opts.gpu_spreadinterponly) {
// XNOR implementation below with boolean logic.
if ((d_plan->opts.upsampfac != 1) == (type != 3)) {
if ((d_plan->opts.upsampfac != 1.0) == (type != 3)) {
ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
goto finalize;
}
Expand Down Expand Up @@ -197,7 +197,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required);
}


// dynamically request the maximum amount of shared memory available
// for the spreader

Expand Down Expand Up @@ -765,8 +764,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N,
d_plan->deconv, d_plan->deconv,
[c1, c2, c3, d1, d2, d3, realsign] __host__ __device__(
const thrust::tuple<T, T, T> tuple, cuda_complex<T> deconv)
-> cuda_complex<T> {
const thrust::tuple<T, T, T> tuple,
cuda_complex<T> deconv) -> cuda_complex<T> {
// d2 and d3 are 0 if dim < 2 and dim < 3
const auto phase = c1 * (thrust::get<0>(tuple) + d1) +
c2 * (thrust::get<1>(tuple) + d2) +
Expand Down
10 changes: 3 additions & 7 deletions src/cuda/cufinufft.cu
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,6 @@ void cufinufft_default_opts(cufinufft_opts *opts)
The resulting struct may then be passed (instead of NULL) to the last
argument of cufinufft_plan().
Options with prefix "gpu_" are used for gpu code.
Notes:
1) Values set in this function for different type and dimensions are preferable
based on experiments. User can experiment with different settings by
Expand All @@ -112,17 +110,15 @@ void cufinufft_default_opts(cufinufft_opts *opts)
{
// sphinx tag (don't remove): @gpu_defopts_start
// data handling opts...
opts->modeord = 0;
opts->gpu_device_id = 0;

// diagnostic opts...
opts->modeord = 0;
opts->gpu_device_id = 0;
opts->gpu_spreadinterponly = 0;

// algorithm performance opts...
opts->gpu_method = 0;
opts->gpu_sort = 1;
opts->gpu_kerevalmeth = 1;
opts->upsampfac = 0;
opts->upsampfac = 0.0;
opts->gpu_maxsubprobsize = 1024;
opts->gpu_obinsizex = 0;
opts->gpu_obinsizey = 0;
Expand Down
50 changes: 33 additions & 17 deletions src/finufft_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -652,27 +652,42 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
// Note: batchSize not used since might be only 1.

spopts.spread_direction = type;
constexpr TF EPSILON = std::numeric_limits<TF>::epsilon();

constexpr TF EPSILON = std::numeric_limits<TF>::epsilon();
if (opts.showwarn) { // user warn round-off error (due to prob condition #)...
for (int idim = 0; idim < dim; ++idim)
if (EPSILON * mstu[idim] > 1.0)
fprintf(stderr,
"%s warning: rounding err (due to cond # of prob) eps_mach*N%d = %.3g "
"> 1 !\n",
__func__, idim, (double)(EPSILON * mstu[idim]));
}
if (opts.spreadinterponly) { // (unusual case of no NUFFT, just report)

// determine fine grid sizes, sanity check them...
for (int idim = 0; idim < dim; ++idim) {
int nfier = set_nf_type12(mstu[idim], opts, spopts, &nfdim[idim]);
if (nfier) throw nfier; // nf too big; we're done
}
// spreadinterp grid will simply be the user's "mode" grid...
for (int idim = 0; idim < dim; ++idim) nfdim[idim] = mstu[idim];

if (!opts.spreadinterponly) { // ..... eval Fourier series, alloc workspace .....
if (opts.debug) { // "long long" here is to avoid warnings with printf...
printf("[%s] %dd spreadinterponly(dir=%d): (ms,mt,mu)=(%lld,%lld,%lld)"
"\n ntrans=%d nthr=%d batchSize=%d kernel width ns=%d",
__func__, dim, type, (long long)mstu[0], (long long)mstu[1],
(long long)mstu[2], ntrans, nthr, batchSize, spopts.nspread);
if (batchSize == 1) // spread_thread has no effect in this case
printf("\n");
else
printf(" spread_thread=%d\n", opts.spread_thread);
}

for (int idim = 0; idim < dim; ++idim)
} else { // ..... usual NUFFT: eval Fourier series, alloc workspace .....

if (opts.showwarn) { // user warn round-off error (due to prob condition #)...
for (int idim = 0; idim < dim; ++idim)
if (EPSILON * mstu[idim] > 1.0)
fprintf(
stderr,
"%s warning: rounding err (due to cond # of prob) eps_mach*N%d = %.3g "
"> 1 !\n",
__func__, idim, (double)(EPSILON * mstu[idim]));
}

// determine fine grid sizes, sanity check, then alloc...
for (int idim = 0; idim < dim; ++idim) {
int nfier = set_nf_type12(mstu[idim], opts, spopts, &nfdim[idim]);
if (nfier) throw nfier; // nf too big; we're done
phiHat[idim].resize(nfdim[idim] / 2 + 1); // alloc fseries
}

if (opts.debug) { // "long long" here is to avoid warnings with printf...
printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) "
Expand Down Expand Up @@ -1018,7 +1033,8 @@ int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
spreadinterpSortedBatch<TF>(thisBatchSize, this, fwBatch_or_fkb, cjb);
t_sprint += timer.elapsedsec();
}
// Release the fwBatch vector to prevent double freeing of memory (explain!)
// Release the fwBatch vector to prevent double freeing of memory (explain or
// delete!)
} // ........end b loop

if (opts.debug) { // report total times in their natural order...
Expand Down
2 changes: 1 addition & 1 deletion src/spreadinterp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2161,7 +2161,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps
return FINUFFT_ERR_HORNER_WRONG_BETA;
}
if (upsampfac < 1.0) { // no digits would result
fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n",
fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is < 1.0\n",
upsampfac);
return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
}
Expand Down

0 comments on commit 2e490a1

Please sign in to comment.