Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature[next]: Temporary extraction heuristics #1341

Merged
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 90 additions & 27 deletions src/gt4py/next/iterator/transforms/global_tmps.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import dataclasses
import functools
from collections.abc import Mapping
from typing import Any, Final, Iterable, Literal, Optional, Sequence
from typing import Any, Callable, Final, Iterable, Literal, Optional, Sequence

import gt4py.eve as eve
import gt4py.next as gtx
Expand Down Expand Up @@ -150,20 +150,54 @@ def canonicalize_applied_lift(closure_params: list[str], node: ir.FunCall) -> ir
return node


def temporary_extraction_predicate(expr: ir.Node, num_occurences: int) -> bool:
"""Determine if `expr` is an applied lift that should be extracted as a temporary."""
if not is_applied_lift(expr):
return False
# do not extract when the result is a list as we can not create temporaries for
# these stencils
if isinstance(expr.annex.type.dtype, type_inference.List):
return False
stencil = expr.fun.args[0] # type: ignore[attr-defined] # ensured by `is_applied_lift`
used_symbols = collect_symbol_refs(stencil)
# do not extract when the stencil is capturing
if used_symbols:
@dataclasses.dataclass(frozen=True)
class TemporaryExtractionPredicate:
"""
Construct a callable that determines if a lift expr can and should be extracted to a temporary.

The class optionally takes a heuristic that can restrict the extraction.
"""

heuristics: Optional[Callable[[ir.Expr], bool]] = None

def __call__(self, expr: ir.Expr, num_occurences: int) -> bool:
"""Determine if `expr` is an applied lift that should be extracted as a temporary."""
if not is_applied_lift(expr):
return False
# do not extract when the result is a list (i.e. a lift expression used in a `reduce` call)
# as we can not create temporaries for these stencils
if isinstance(expr.annex.type.dtype, type_inference.List):
return False
if self.heuristics and not self.heuristics(expr):
return False
stencil = expr.fun.args[0] # type: ignore[attr-defined] # ensured by `is_applied_lift`
# do not extract when the stencil is capturing
used_symbols = collect_symbol_refs(stencil)
if used_symbols:
return False
return True


@dataclasses.dataclass(frozen=True)
class SimpleTemporaryExtractionHeuristics:
"""
Heuristic that extracts only if a lift expr is derefed in more than one position.

Note that such expression result in redundant computations if inlined instead of being
placed into a temporary.
"""

closure: ir.StencilClosure

@functools.cached_property
def closure_shifts(self):
return trace_shifts.TraceShifts.apply(self.closure, inputs_only=False)

def __call__(self, expr: ir.Expr) -> bool:
shifts = self.closure_shifts[id(expr)]
if len(shifts) > 1:
return True
return False
return True


def _closure_parameter_argument_mapping(closure: ir.StencilClosure):
Expand Down Expand Up @@ -193,7 +227,14 @@ def _ensure_expr_does_not_capture(expr: ir.Expr, whitelist: list[ir.Sym]) -> Non
assert not (set(used_symbol_refs) - {param.id for param in whitelist})


def split_closures(node: ir.FencilDefinition, offset_provider) -> FencilWithTemporaries:
def split_closures(
node: ir.FencilDefinition,
offset_provider,
*,
extraction_heuristics: Optional[
Callable[[ir.StencilClosure], Callable[[ir.Expr], bool]]
] = None,
) -> FencilWithTemporaries:
"""Split closures on lifted function calls and introduce new temporary buffers for return values.

Newly introduced temporaries will have the symbolic size of `AUTO_DOMAIN`. A symbol with the
Expand All @@ -205,6 +246,13 @@ def split_closures(node: ir.FencilDefinition, offset_provider) -> FencilWithTemp
3. Extract lifted function class as new closures with the previously created temporary as output.
The closures are processed in reverse order to properly respect the dependencies.
"""
if not extraction_heuristics:
# extract all (eligible) lifts
def always_extract_heuristics(_):
return lambda _: True

extraction_heuristics = always_extract_heuristics
Comment on lines +249 to +254
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we put this as default argument?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That wouldn't be very handy. The heuristics is passed through from the backend configuration trough the pass manager, temporary extraction pass until it ends up here. By using None we can easily just specify None in the backend configuration and it gets passed through until it is here translated into the default heuristics.


uid_gen_tmps = UIDGenerator(prefix="_tmp")

type_inference.infer_all(node, offset_provider=offset_provider, save_to_annex=True)
Expand All @@ -228,9 +276,13 @@ def split_closures(node: ir.FencilDefinition, offset_provider) -> FencilWithTemp
current_closure.stencil if not is_scan else current_closure.stencil.args[0] # type: ignore[attr-defined] # ensured by is_scan
)

extraction_predicate = TemporaryExtractionPredicate(
extraction_heuristics(current_closure)
)

stencil_body, extracted_lifts, _ = extract_subexpression(
current_closure_stencil.expr,
temporary_extraction_predicate,
extraction_predicate,
uid_gen_tmps,
once_only=True,
deepest_expr_first=True,
Expand Down Expand Up @@ -454,7 +506,12 @@ def update_domains(
if closure.domain == AUTO_DOMAIN:
# every closure with auto domain should have a single out field
assert isinstance(closure.output, ir.SymRef)

if closure.output.id not in domains:
raise NotImplementedError(f"Closure output '{closure.output.id}' is never used.")

domain = domains[closure.output.id]

closure = ir.StencilClosure(
domain=copy.deepcopy(domain),
stencil=closure.stencil,
Expand All @@ -467,14 +524,6 @@ def update_domains(

closures.append(closure)

if closure.stencil == ir.SymRef(id="deref"):
# all closure inputs inherit the domain
for input_arg in _tuple_constituents(closure.inputs[0]):
assert isinstance(input_arg, ir.SymRef)
assert domains.get(input_arg.id, domain) == domain
domains[input_arg.id] = domain
continue

local_shifts = trace_shifts.TraceShifts.apply(closure)
for param, shift_chains in local_shifts.items():
assert isinstance(param, str)
Expand Down Expand Up @@ -512,13 +561,22 @@ def update_domains(
(axis, range_) if axis != old_axis else (new_axis, new_range)
for axis, range_ in consumed_domain.ranges.items()
)
# TODO(tehrengruber): Revisit. Somehow the order matters so preserve it.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for unstructured_domain(horizontal, vertical) the order matters, is that the point here?

Copy link
Contributor Author

@tehrengruber tehrengruber Feb 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guessing that it is more the TemporaryAllocation where the order matters, but I am not sure. The unstructured_domain is also using TaggedValues so it should be fine.

consumed_domain.ranges = dict(
(axis, range_) if axis != old_axis else (new_axis, new_range)
for axis, range_ in consumed_domain.ranges.items()
)
else:
raise NotImplementedError
raise NotImplementedError()
consumed_domains.append(consumed_domain)

# compute the bounds of all consumed domains
if consumed_domains:
domains[param] = domain_union(consumed_domains).as_expr()
if all(
consumed_domain.ranges.keys() == consumed_domains[0].ranges.keys()
for consumed_domain in consumed_domains
): # scalar otherwise
domains[param] = domain_union(consumed_domains).as_expr()

return FencilWithTemporaries(
fencil=ir.FencilDefinition(
Expand Down Expand Up @@ -597,10 +655,15 @@ def visit_FencilDefinition(
node: ir.FencilDefinition,
*,
offset_provider: Mapping[str, Any],
extraction_heuristics: Optional[
Callable[[ir.StencilClosure], Callable[[ir.Expr], bool]]
] = None,
symbolic_sizes: Optional[dict[str, str]],
) -> FencilWithTemporaries:
# Split closures on lifted function calls and introduce temporaries
res = split_closures(node, offset_provider=offset_provider)
res = split_closures(
node, offset_provider=offset_provider, extraction_heuristics=extraction_heuristics
)
# Prune unreferences closure inputs introduced in the previous step
res = PruneClosureInputs().visit(res)
# Prune unused temporaries possibly introduced in the previous step
Expand Down
50 changes: 35 additions & 15 deletions src/gt4py/next/iterator/transforms/pass_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# SPDX-License-Identifier: GPL-3.0-or-later

import enum
from typing import Optional
from typing import Callable, Optional

from gt4py.next.iterator import ir
from gt4py.next.iterator.transforms import simple_inline_heuristic
Expand Down Expand Up @@ -51,8 +51,6 @@ def _inline_lifts(ir, lift_mode):
return InlineLifts(
flags=InlineLifts.Flag.INLINE_TRIVIAL_DEREF_LIFT
| InlineLifts.Flag.INLINE_DEREF_LIFT # some tuple exprs found in FVM don't work yet.
| InlineLifts.Flag.INLINE_LIFTED_ARGS
# needed for UnrollReduce and lift args like `(↑(λ() → constant)`
).visit(ir)
else:
raise ValueError()
Expand All @@ -73,6 +71,8 @@ def _inline_into_scan(ir, *, max_iter=10):
return ir


# TODO(tehrengruber): Revisit interface to configure temporary extraction. We currently forward
# `lift_mode` and `temporary_extraction_heuristics` which is inconvenient.
def apply_common_transforms(
ir: ir.Node,
*,
Expand All @@ -82,6 +82,9 @@ def apply_common_transforms(
common_subexpression_elimination=True,
force_inline_lambda_args=False,
unconditionally_collapse_tuples=False,
temporary_extraction_heuristics: Optional[
Callable[[ir.StencilClosure], Callable[[ir.Expr], bool]]
] = None,
symbolic_domain_sizes: Optional[dict[str, str]] = None,
):
if lift_mode is None:
Expand Down Expand Up @@ -121,6 +124,33 @@ def apply_common_transforms(
else:
raise RuntimeError("Inlining 'lift' and 'lambdas' did not converge.")

if lift_mode != LiftMode.FORCE_INLINE:
assert offset_provider is not None
ir = CreateGlobalTmps().visit(
ir,
offset_provider=offset_provider,
extraction_heuristics=temporary_extraction_heuristics,
symbolic_sizes=symbolic_domain_sizes,
)

for _ in range(10):
inlined = InlineLifts().visit(ir)
inlined = InlineLambdas.apply(
inlined,
opcount_preserving=True,
force_inline_lift_args=True,
)
if inlined == ir:
break
ir = inlined
else:
raise RuntimeError("Inlining 'lift' and 'lambdas' did not converge.")

# If after creating temporaries, the scan is not at the top, we inline.
# The following example doesn't have a lift around the shift, i.e. temporary pass will not extract it.
# λ(inp) → scan(λ(state, k, kp) → state + ·k + ·kp, True, 0.0)(inp, ⟪Koffₒ, 1ₒ⟫(inp))`
ir = _inline_into_scan(ir)

# Since `CollapseTuple` relies on the type inference which does not support returning tuples
# larger than the number of closure outputs as given by the unconditional collapse, we can
# only run the unconditional version here instead of in the loop above.
Expand All @@ -134,6 +164,7 @@ def apply_common_transforms(

ir = FuseMaps().visit(ir)
ir = CollapseListGet().visit(ir)

if unroll_reduce:
for _ in range(10):
unrolled = UnrollReduce.apply(ir, offset_provider=offset_provider)
Expand All @@ -142,22 +173,11 @@ def apply_common_transforms(
ir = unrolled
ir = CollapseListGet().visit(ir)
ir = NormalizeShifts().visit(ir)
ir = _inline_lifts(ir, lift_mode)
ir = _inline_lifts(ir, LiftMode.FORCE_INLINE)
ir = NormalizeShifts().visit(ir)
else:
raise RuntimeError("Reduction unrolling failed.")

if lift_mode != LiftMode.FORCE_INLINE:
assert offset_provider is not None
ir = CreateGlobalTmps().visit(
ir, offset_provider=offset_provider, symbolic_sizes=symbolic_domain_sizes
)
ir = InlineLifts().visit(ir)
# If after creating temporaries, the scan is not at the top, we inline.
# The following example doesn't have a lift around the shift, i.e. temporary pass will not extract it.
# λ(inp) → scan(λ(state, k, kp) → state + ·k + ·kp, True, 0.0)(inp, ⟪Koffₒ, 1ₒ⟫(inp))`
ir = _inline_into_scan(ir)

ir = EtaReduction().visit(ir)
ir = ScanEtaReduction().visit(ir)

Expand Down
4 changes: 4 additions & 0 deletions src/gt4py/next/iterator/type_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import gt4py.next as gtx
from gt4py.next.common import Connectivity
from gt4py.next.iterator import ir
from gt4py.next.iterator.transforms.global_tmps import FencilWithTemporaries
from gt4py.next.type_inference import Type, TypeVar, freshen, reindex_vars, unify


Expand Down Expand Up @@ -936,6 +937,9 @@ def visit_StencilClosure(
)
return Closure(output=output, inputs=Tuple.from_elems(*inputs))

def visit_FencilWithTemporaries(self, node: FencilWithTemporaries, **kwargs):
return self.visit(node.fencil, **kwargs)

Comment on lines +940 to +942
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the pass does not return nodes again, but their type. As such the generic visit would try to create a FencilWithTemporaries where fencil has then been transformed into a type and then fail with e.g.

TypeError: 'FencilWithTemporaries.fencil' must be <class 'gt4py.next.iterator.ir.FencilDefinition'> (got 'FencilDefinitionType(name='__field_operator_testee', fundefs=EmptyTuple(), params=Tuple(front=TypeVar(idx=1305), others=Tuple(front=TypeVar(idx=1314), others=Tuple(front=TypeVar(idx=1323), others=Tuple(front=TypeVar(idx=1324), others=Tuple(front=TypeVar(idx=1325), others=Tuple(front=TypeVar(idx=1326), others=Tuple(front=TypeVar(idx=1327), others=Tuple(front=TypeVar(idx=1328), others=EmptyTuple())))))))))' which is a <class 'gt4py.next.iterator.type_inference.FencilDefinitionType'>).

def visit_FencilDefinition(
self,
node: ir.FencilDefinition,
Expand Down
10 changes: 7 additions & 3 deletions src/gt4py/next/program_processors/codegens/gtfn/gtfn_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import dataclasses
import functools
import warnings
from typing import Any, Final, Optional
from typing import Any, Callable, Final, Optional

import numpy as np

Expand Down Expand Up @@ -58,6 +58,9 @@ class GTFNTranslationStep(
lift_mode: Optional[LiftMode] = None
device_type: core_defs.DeviceType = core_defs.DeviceType.CPU
symbolic_domain_sizes: Optional[dict[str, str]] = None
temporary_extraction_heuristics: Optional[
Callable[[itir.StencilClosure], Callable[[itir.Expr], bool]]
] = None

def _default_language_settings(self) -> languages.LanguageWithHeaderFilesSettings:
match self.device_type:
Expand Down Expand Up @@ -179,14 +182,14 @@ def _preprocess_program(
self,
program: itir.FencilDefinition,
offset_provider: dict[str, Connectivity | Dimension],
runtime_lift_mode: Optional[LiftMode] = None,
runtime_lift_mode: Optional[LiftMode],
) -> itir.FencilDefinition:
# TODO(tehrengruber): Remove `lift_mode` from call interface. It has been implicitly added
# to the interface of all (or at least all of concern) backends, but instead should be
# configured in the backend itself (like it is here), until then we respect the argument
# here and warn the user if it differs from the one configured.
lift_mode = runtime_lift_mode or self.lift_mode
if lift_mode != self.lift_mode:
if runtime_lift_mode and runtime_lift_mode != self.lift_mode:
warnings.warn(
f"GTFN Backend was configured for LiftMode `{str(self.lift_mode)}`, but "
f"overriden to be {str(runtime_lift_mode)} at runtime."
Expand All @@ -202,6 +205,7 @@ def _preprocess_program(
# sid::composite (via hymap) supports assigning from tuple with more elements to tuple with fewer elements
unconditionally_collapse_tuples=True,
symbolic_domain_sizes=self.symbolic_domain_sizes,
temporary_extraction_heuristics=self.temporary_extraction_heuristics,
)

new_program = apply_common_transforms(
Expand Down
5 changes: 3 additions & 2 deletions src/gt4py/next/program_processors/runners/gtfn.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import gt4py.next.allocators as next_allocators
from gt4py.eve.utils import content_hash
from gt4py.next import common
from gt4py.next.iterator.transforms import LiftMode
from gt4py.next.iterator.transforms import LiftMode, global_tmps
from gt4py.next.otf import languages, recipes, stages, step_types, workflow
from gt4py.next.otf.binding import nanobind
from gt4py.next.otf.compilation import cache, compiler
Expand Down Expand Up @@ -187,7 +187,8 @@ def compilation_hash(otf_closure: stages.ProgramCall) -> int:
name="run_gtfn_with_temporaries",
otf_workflow=gtfn_executor.otf_workflow.replace(
translation=gtfn_executor.otf_workflow.translation.replace(
lift_mode=LiftMode.FORCE_TEMPORARIES
lift_mode=LiftMode.FORCE_TEMPORARIES,
temporary_extraction_heuristics=global_tmps.SimpleTemporaryExtractionHeuristics,
),
),
),
Expand Down
Loading
Loading