Skip to content

Commit cfe3567

Browse files
committed
Optimize L2P for GPUs
1 parent 574f89f commit cfe3567

File tree

6 files changed

+343
-0
lines changed

6 files changed

+343
-0
lines changed

sumpy/expansion/local.py

+8
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,14 @@ def loopy_translate_from(self, src_expansion):
405405
f"A direct loopy kernel for translation from "
406406
f"{src_expansion} to {self} is not implemented.")
407407

408+
def loopy_evaluate(self, kernels):
409+
from sumpy.expansion.loopy import (make_l2p_loopy_kernel_for_volume_taylor,
410+
make_e2p_loopy_kernel)
411+
try:
412+
return make_l2p_loopy_kernel_for_volume_taylor(self, kernels)
413+
except NotImplementedError:
414+
return make_e2p_loopy_kernel(self, kernels)
415+
408416

409417
class VolumeTaylorLocalExpansion(
410418
VolumeTaylorExpansion,

sumpy/expansion/loopy.py

+329
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@
2222

2323
from typing import Sequence
2424
import pymbolic
25+
import pymbolic.primitives as prim
2526
import loopy as lp
2627
import numpy as np
2728
from sumpy.expansion import ExpansionBase
2829
from sumpy.kernel import Kernel
2930
import sumpy.symbolic as sym
3031
from sumpy.assignment_collection import SymbolicAssignmentCollection
3132
from sumpy.tools import gather_loopy_arguments, gather_loopy_source_arguments
33+
from math import prod, gcd
3234

3335
import logging
3436
logger = logging.getLogger(__name__)
@@ -225,3 +227,330 @@ def make_p2e_loopy_kernel(
225227
loopy_knl = kernel.prepare_loopy_kernel(loopy_knl)
226228

227229
return loopy_knl
230+
231+
232+
def make_l2p_loopy_kernel_for_volume_taylor(expansion, kernels):
233+
dim = expansion.dim
234+
order = expansion.order
235+
ncoeffs = len(expansion)
236+
237+
code_transformers = [expansion.get_code_transformer()] \
238+
+ [kernel.get_code_transformer() for kernel in kernels]
239+
pymbolic_conv = sym.SympyToPymbolicMapper()
240+
241+
max_deriv_order = 0
242+
sym_expr_dicts = []
243+
for kernel in kernels:
244+
expr_dict = {(0,)*dim: 1}
245+
expr_dict = kernel.get_derivative_coeff_dict_at_target(expr_dict)
246+
max_deriv_order = max(max_deriv_order, max(sum(mi) for mi in expr_dict))
247+
sym_expr_dict = {}
248+
for mi, coeff in expr_dict.items():
249+
coeff = pymbolic_conv(coeff)
250+
for transform in code_transformers:
251+
coeff = transform(coeff)
252+
sym_expr_dict[mi] = coeff
253+
sym_expr_dicts.append(sym_expr_dict)
254+
255+
domains = [
256+
"{[idim]: 0<=idim<dim}",
257+
"{[iorder0]: 0<iorder0<=order}",
258+
"{[zero_idx]: 0<=zero_idx<max_deriv_order}",
259+
"{[icoeff]: 0<=icoeff<ncoeffs}",
260+
]
261+
262+
powers = pymbolic.var("power_b")
263+
iorder = pymbolic.var("iorder0")
264+
idim = pymbolic.var("idim")
265+
result = pymbolic.var("result")
266+
b = pymbolic.var("b")
267+
center = pymbolic.var("center")
268+
target = pymbolic.var("target")
269+
rscale = pymbolic.var("rscale")
270+
coeffs = pymbolic.var("coeffs")
271+
icoeff = pymbolic.var("icoeff")
272+
zero_idx = pymbolic.var("zero_idx")
273+
temporary_variables = []
274+
275+
insns = [
276+
lp.Assignment(
277+
assignee=b[idim],
278+
expression=(target[idim] - center[idim])*(1/rscale),
279+
id="set_b",
280+
temp_var_type=lp.Optional(None),
281+
),
282+
# We need negative index access in the array to be zero
283+
# However loopy does not support negative indices, and we
284+
# have an offset of max_deriv_order for array access and
285+
# the first max_deriv_order values are set to zero.
286+
lp.Assignment(
287+
assignee=powers[idim, zero_idx],
288+
expression=0,
289+
id="zero_monomials",
290+
temp_var_type=lp.Optional(None),
291+
),
292+
lp.Assignment(
293+
assignee=powers[idim, max_deriv_order],
294+
expression=1,
295+
id="init_monomials",
296+
depends_on=frozenset(["zero_monomials"]),
297+
),
298+
lp.Assignment(
299+
assignee=powers[idim, max_deriv_order + iorder],
300+
expression=(
301+
powers[idim, max_deriv_order + iorder - 1]*b[idim]*(1/iorder)),
302+
id="update_monomials",
303+
depends_on=frozenset(["set_b", "init_monomials"]),
304+
),
305+
]
306+
307+
optimizations = [lambda knl: lp.tag_inames(knl, "e2p_iorder0:unr")]
308+
iorder = pymbolic.var("iorder1")
309+
wrangler = expansion.expansion_terms_wrangler
310+
311+
from sumpy.expansion import LinearPDEConformingVolumeTaylorExpansion
312+
if not isinstance(expansion, LinearPDEConformingVolumeTaylorExpansion):
313+
v = [pymbolic.var(f"x{i}") for i in range(dim)]
314+
domains += ["{[iorder1]: 0<=1iorder1<=order}"]
315+
upper_bound = "iorder1"
316+
for i in range(dim - 1, 0, -1):
317+
domains += [f"{{ [{v[i]}]: 0<={v[i]}<={upper_bound} }}"]
318+
upper_bound += f"-{v[i]}"
319+
domains += [f"{{ [{v[0]}]: {upper_bound}<={v[0]}<={upper_bound} }}"]
320+
idx = wrangler.get_storage_index(v, iorder)
321+
322+
for ikernel, expr_dict in enumerate(sym_expr_dicts):
323+
expr = sum(coeff * prod(powers[i,
324+
v[i] + max_deriv_order - mi[i]] for i in range(dim))
325+
* (1 / rscale ** sum(mi))
326+
for mi, coeff in expr_dict.items())
327+
328+
insn = lp.Assignment(
329+
assignee=result[ikernel],
330+
expression=(result[ikernel]
331+
+ coeffs[idx] * expr),
332+
id=f"write_{ikernel}",
333+
depends_on=frozenset(["update_monomials"]),
334+
)
335+
insns.append(insn)
336+
optimizations.append(lambda knl: lp.tag_inames(knl, "e2p_iorder1:unr"))
337+
else:
338+
coeffs_copy = pymbolic.var("coeffs_copy")
339+
insns.append(lp.Assignment(
340+
assignee=coeffs_copy[0, icoeff],
341+
expression=coeffs[icoeff],
342+
id="copy_coeffs",
343+
))
344+
# We need two rows for coeffs_copy since we cannot use inplace
345+
# updates due to parallel updates so we alternatively use
346+
# coeffs_copy[0, :] and coeffs_copy[1, :] to write and read from.
347+
temporary_variables.append(lp.TemporaryVariable(
348+
name="coeffs_copy",
349+
shape=(2, ncoeffs),
350+
))
351+
base_kernel = kernels[0].get_base_kernel()
352+
deriv_id_to_coeff, = base_kernel.get_pde_as_diff_op().eqs
353+
354+
ordering_key, axis_permutation = \
355+
wrangler._get_mi_ordering_key_and_axis_permutation()
356+
max_deriv_id = max(deriv_id_to_coeff, key=ordering_key)
357+
max_mi = max_deriv_id.mi
358+
359+
if all(m != 0 for m in max_mi):
360+
raise NotImplementedError("non-elliptic PDEs")
361+
362+
slowest_axis = axis_permutation[0]
363+
c = max_mi[slowest_axis]
364+
v = [pymbolic.var(f"x{i}") for i in range(dim)]
365+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
366+
x0 = v[0]
367+
368+
# sync_split is the maximum number of iterations in v[0] that we can do
369+
# before a synchronization is needed. For Laplace 2D there are two rows
370+
# of stored coeffs, and both of them can be calculated before a sync
371+
# is needed. For biharmonic 2D there are four rows in stored coeffs,
372+
# but synchronization needs to happen every two rows because calculating
373+
# the 6th row needs the 4th row synchronized
374+
sync_split = gcd(*[c - deriv_id.mi[slowest_axis]
375+
for deriv_id in deriv_id_to_coeff])
376+
377+
def get_domains(v, iorder, with_sync):
378+
domains = [f"{{ [{x0}_outer]: 0<={x0}_outer<={order//c} }}"]
379+
if with_sync:
380+
expr = f"{c//sync_split}*{x0}_sync_outer + {c}*{x0}_outer"
381+
domains += [f"{{ [{x0}_sync_outer]: 0<={expr}<={order} "
382+
f"and 0<={x0}_sync_outer<{c//sync_split} }}"]
383+
expr += f" + {v[0]}_inner"
384+
domains += [f"{{ [{v[0]}_inner]: 0<={expr}<={order} "
385+
f"and 0<={v[0]}_inner<{sync_split} }}"]
386+
else:
387+
expr = f"{v[0]}_inner + {c}*{x0}_outer"
388+
domains += [f"{{ [{v[0]}_inner]: 0<={expr}<={order} "
389+
f"and 0<={v[0]}_inner<{c} }}"]
390+
domains += [f"{{ [{v[0]}]: {expr}<={v[0]}<={expr} }}"]
391+
domains += [f"{{ [{iorder}]: {v[0]}<={iorder}<={order} }}"]
392+
upper_bound = f"{iorder}-{v[0]}"
393+
for i in range(dim - 1, 1, -1):
394+
domains += [f"{{ [{v[i]}]: 0<={v[i]}<={upper_bound} }}"]
395+
upper_bound += f"-{v[i]}"
396+
domains += [
397+
f"{{ [{v[1]}]: {upper_bound}<={v[1]}<={upper_bound} }}"]
398+
return domains
399+
400+
def get_idx(v):
401+
idx_sym = list(v)
402+
idx_sym[0] = v[0] % c
403+
idx = wrangler.get_storage_index(idx_sym)
404+
return idx
405+
406+
domains += get_domains(v, iorder, with_sync=True)
407+
idx = get_idx(v)
408+
409+
if c == sync_split:
410+
# We do not need to sync within the c rows.
411+
# Update the values from the c rows set coeffs_copy[p, :] from
412+
# the previous c rows set coeffs_copy[p-1, :]
413+
# and then read from coeffs_copy[p, :].
414+
# This code path is different to avoid an extra copy and
415+
# a synchronization step.
416+
prev_copy_idx = (v[0]//c - 1) % 2
417+
curr_copy_idx = (v[0]//c) % 2
418+
else:
419+
# We need to sync within the c rows.
420+
# Using the biharmonic 2D example:
421+
# - Update the rows 4, 5 at coeffs_copy[1, :] from values at
422+
# coeffs_copy[0, :]
423+
# - Synchronize
424+
# - Copy the rows 4, 5 from coeffs_copy[1, :] to coeffs_copy[0, :]
425+
# - Synchronize
426+
# - Update the rows 6, 7 at coeffs_copy[1, :] from values at
427+
# coeffs_copy[0, :]
428+
# - Synchronize
429+
# - Copy the rows 6, 7 from coeffs_copy[1, :] to coeffs_copy[0, :]
430+
# - Synchronize
431+
# - Read the rows 4, 5, 6, 7 from coeffs_copy[0, :]
432+
prev_copy_idx = 0
433+
curr_copy_idx = 1
434+
435+
max_mi_sym = [v[i] - max_mi[i] for i in range(dim)]
436+
scale = -1/deriv_id_to_coeff[max_deriv_id]
437+
expr = 0
438+
for deriv_id, pde_coeff in deriv_id_to_coeff.items():
439+
if deriv_id == max_deriv_id:
440+
continue
441+
mi_sym = [max_mi_sym[i] + deriv_id.mi[i] for i in range(dim)]
442+
mi_sym[0] = mi_sym[0] % c
443+
expr += (coeffs_copy[prev_copy_idx,
444+
wrangler.get_storage_index(mi_sym)]
445+
* (rscale**(sum(max_mi) - sum(deriv_id.mi))
446+
* pymbolic_conv(pde_coeff) * scale))
447+
448+
insns.append(lp.Assignment(
449+
assignee=coeffs_copy[curr_copy_idx, idx],
450+
expression=expr,
451+
id="update_coeffs",
452+
depends_on=frozenset(["copy_coeffs"]),
453+
depends_on_is_final=True,
454+
predicates=frozenset([prim.Comparison(v[0], ">=", c)]),
455+
))
456+
457+
if c != sync_split:
458+
# We now copy before synchronization
459+
v = [pymbolic.var(f"z{i}") for i in range(dim)]
460+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
461+
iorder = pymbolic.var("iorder3")
462+
idx = get_idx(v)
463+
domains += get_domains(v, iorder, with_sync=True)[2:]
464+
465+
insns.append(lp.Assignment(
466+
assignee=coeffs_copy[0, idx],
467+
expression=coeffs_copy[1, idx],
468+
id="copy_sync",
469+
depends_on=frozenset(["update_coeffs"]),
470+
depends_on_is_final=True,
471+
predicates=frozenset([prim.Comparison(v[0], ">=", c)]),
472+
))
473+
474+
v = [pymbolic.var(f"y{i}") for i in range(dim)]
475+
v[slowest_axis], v[0] = v[0], v[slowest_axis]
476+
iorder = pymbolic.var("iorder2")
477+
idx = get_idx(v)
478+
domains += get_domains(v, iorder, with_sync=False)[1:]
479+
480+
if c == sync_split:
481+
# We did not have to sync within the c rows.
482+
# We last wrote to coeffs_copy[v[0]//c % 2, :] and we read from it.
483+
fetch_idx = (v[0]//c) % 2
484+
else:
485+
# We need to sync within the c rows.
486+
# We last wrote to coeffs_copy[0, :] and we read from it.
487+
fetch_idx = 0
488+
489+
for ikernel, expr_dict in enumerate(sym_expr_dicts):
490+
expr = sum(coeff * prod(powers[i,
491+
v[i] + max_deriv_order - mi[i]] for i in range(dim))
492+
* (1 / rscale ** sum(mi))
493+
for mi, coeff in expr_dict.items())
494+
495+
insn = lp.Assignment(
496+
assignee=result[ikernel],
497+
expression=(result[ikernel]
498+
+ coeffs_copy[fetch_idx, idx] * expr),
499+
id=f"write_{ikernel}",
500+
depends_on=frozenset(["update_monomials",
501+
"update_coeffs" if c == sync_split else "copy_sync"]),
502+
depends_on_is_final=True,
503+
)
504+
insns.append(insn)
505+
506+
tags = {
507+
"e2p_iorder1": "l.0",
508+
f"e2p_{x0}_outer": "unr",
509+
f"e2p_{x0}_inner": "unr",
510+
f"e2p_{v[0]}_inner": "unr",
511+
"e2p_iorder2": "unr",
512+
}
513+
if c != sync_split:
514+
tags["e2p_iorder3"] = "l.0"
515+
516+
optimizations += [
517+
lambda knl: lp.tag_inames(knl, tags),
518+
lambda knl: lp.set_temporary_address_space(knl, "e2p_coeffs_copy",
519+
lp.AddressSpace.LOCAL),
520+
lambda knl: lp.split_iname(knl, "e2p_icoeff", 32, inner_tag="l.0"),
521+
]
522+
523+
target_args = gather_loopy_arguments((expansion,) + tuple(kernels))
524+
loopy_knl = lp.make_function(domains, insns,
525+
kernel_data=[
526+
lp.GlobalArg("result", shape=(len(kernels),), is_input=True,
527+
is_output=True),
528+
lp.GlobalArg("coeffs",
529+
shape=(ncoeffs,), is_input=True, is_output=False),
530+
lp.GlobalArg("center, target",
531+
shape=(dim,), is_input=True, is_output=False),
532+
lp.ValueArg("rscale", is_input=True),
533+
lp.ValueArg("itgt", is_input=True),
534+
lp.ValueArg("ntargets", is_input=True),
535+
lp.GlobalArg("targets",
536+
shape=(dim, "ntargets"), is_input=True, is_output=False),
537+
*target_args,
538+
*temporary_variables,
539+
...],
540+
name="e2p",
541+
lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
542+
fixed_parameters={
543+
"dim": dim,
544+
"nresults": len(kernels),
545+
"order": order,
546+
"max_deriv_order": max_deriv_order,
547+
"ncoeffs": ncoeffs,
548+
},
549+
)
550+
551+
loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
552+
553+
for kernel in kernels:
554+
loopy_knl = kernel.prepare_loopy_kernel(loopy_knl)
555+
556+
return loopy_knl, optimizations

sumpy/fmm.py

+2
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,7 @@ def eval_multipoles(self,
933933
result=pot,
934934

935935
rscale=self.level_to_rscale(isrc_level),
936+
max_ntargets_in_one_box=self.max_ntargets_in_one_box,
936937

937938
wait_for=wait_for,
938939

@@ -1069,6 +1070,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
10691070
result=pot,
10701071

10711072
rscale=self.level_to_rscale(lev),
1073+
max_ntargets_in_one_box=self.max_ntargets_in_one_box,
10721074

10731075
**kwargs)
10741076
events.append(evt)

sumpy/symbolic.py

+1
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def _find_symbolic_backend():
118118
functions = sym.functions
119119
Number = sym.Number
120120
Float = sym.Float
121+
diff = sym.diff
121122

122123

123124
def _coeff_isneg(a):

sumpy/toys.py

+1
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ def _e2p(psource, targets, e2p):
250250
target_boxes=boxes,
251251
box_target_starts=box_target_starts,
252252
box_target_counts_nonchild=box_target_counts_nonchild,
253+
max_ntargets_in_one_box=ntargets,
253254
centers=centers,
254255
rscale=psource.rscale,
255256
targets=targets,

0 commit comments

Comments
 (0)