From 60cb65a7c9d9eceae9f68d814f649efec9989965 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 29 May 2017 15:09:54 +0200 Subject: [PATCH 01/32] dle: Clean up (docs, useless stuff) --- devito/dle/backends/advanced.py | 22 ++++++++++------------ devito/dle/backends/basic.py | 8 ++++---- devito/dle/backends/common.py | 29 +++++++++-------------------- devito/dle/manipulation.py | 10 +++++++--- devito/nodes.py | 6 ++---- devito/tools.py | 21 --------------------- devito/visitors.py | 11 +++++++---- 7 files changed, 39 insertions(+), 68 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 93e06e8599..4f8e75901c 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -101,25 +101,23 @@ def _loop_blocking(self, state, **kwargs): """ Apply loop blocking to :class:`Iteration` trees. - By default, the blocked :class:`Iteration` objects and the block size are - determined heuristically. The heuristic consists of searching the deepest - Iteration/Expression tree and blocking all dimensions except: + Blocking is applied to parallel iteration trees. Heuristically, innermost + dimensions are not blocked to maximize the trip count of the SIMD loops. - * The innermost (eg, to retain SIMD vectorization); - * Those dimensions inducing loop-carried dependencies. - - The caller may take over the heuristic through ``kwargs['blocking']``, - a dictionary indicating the block size of each blocked dimension. For - example, for the :class:`Iteration` tree below: :: + Different heuristics may be specified via ``kwargs['blockshape']`` and + ``kwargs['blockinner']``. The former, a dictionary, is used to indicate + a specific block size for each blocked dimension. For example, for the + :class:`Iteration` tree: :: for i for j for k ... - one may pass in ``kwargs['blocking'] = {i: 4, j: 7}``, in which case the - two outer loops would be blocked, and the resulting 2-dimensional block - would be of size 4x7. + one may provide ``kwargs['blockshape'] = {i: 4, j: 7}``, in which case the + two outer loops will blocked, and the resulting 2-dimensional block will + have size 4x7. The latter may be set to True to also block innermost parallel + :class:`Iteration` objects. """ Region = namedtuple('Region', 'main leftover') diff --git a/devito/dle/backends/basic.py b/devito/dle/backends/basic.py index cae9717419..9d5c3dcdf8 100644 --- a/devito/dle/backends/basic.py +++ b/devito/dle/backends/basic.py @@ -22,10 +22,10 @@ def _pipeline(self, state): @dle_pass def _avoid_denormals(self, state, **kwargs): """ - Introduce nodes in the Iteration/Expression tree that will generate macros - to avoid computing with denormal numbers. These are normally flushed away - when using SSE-like instruction sets in a complete C program, but when - compiling shared objects specific instructions must instead be inserted. + Introduce nodes in the Iteration/Expression tree that will expand to C + macros telling the CPU to flush denormal numbers in hardware. Denormals + are normally flushed when using SSE-based instruction sets, except when + compiling shared objects. """ return {'nodes': (Denormals(),) + state.nodes, 'includes': ('xmmintrin.h', 'pmmintrin.h')} diff --git a/devito/dle/backends/common.py b/devito/dle/backends/common.py index 1cdff58971..26b2e1036a 100644 --- a/devito/dle/backends/common.py +++ b/devito/dle/backends/common.py @@ -47,11 +47,6 @@ def update(self, nodes=None, elemental_functions=None, arguments=None, self.includes += as_tuple(includes) self.flags += as_tuple(flags) - @property - def has_applied_nontemporal_stores(self): - """True if nontemporal stores will be generated, False otherwise.""" - return 'ntstores' in self.flags - @property def has_applied_blocking(self): """True if loop blocking was applied, False otherwise.""" @@ -67,8 +62,6 @@ class Arg(object): """A DLE-produced argument.""" - from_Blocking = False - def __init__(self, argument, value): self.argument = argument self.value = value @@ -79,8 +72,6 @@ def __repr__(self): class BlockingArg(Arg): - from_Blocking = True - def __init__(self, blocked_dim, iteration, value): """ Represent an argument introduced in the kernel by Rewriter._loop_blocking. @@ -107,7 +98,7 @@ class AbstractRewriter(object): Transform Iteration/Expression trees to generate high performance C. This is just an abstract class. Actual transformers should implement the - abstract method ``_pipeline``, that is a sequence of AST transformations. + abstract method ``_pipeline``, which performs a sequence of AST transformations. """ __metaclass__ = abc.ABCMeta @@ -146,19 +137,17 @@ def _analyze(self, state): Analyze the Iteration/Expression trees in ``state.nodes`` to detect information useful to the subsequent DLE passes. - In particular, the presence of fully-parallel or "outermost-sequential - inner-parallel" (OSIP) :class:`Iteration` trees is tracked. In an OSIP - :class:`Iteration` tree, outermost :class:`Iteration` objects represent - an inherently sequential dimension, whereas all inner :class:`Iteration` - objects represent parallelizable dimensions. + In particular, fully-parallel or "outermost-sequential inner-parallel" + (OSIP) :class:`Iteration` trees are searched tracked. In an OSIP + :class:`Iteration` tree, the outermost :class:`Iteration` represents + a sequential dimension, whereas all inner :class:`Iteration` objects + represent parallel dimensions. """ - nodes = state.nodes sections = FindSections().visit(nodes) - trees = sections.keys() - candidate = max(trees, key=lambda i: len(i)) - candidates = [i for i in trees if len(i) == len(candidate)] + candidate = max(list(sections), key=lambda i: len(i)) + candidates = [i for i in sections if len(i) == len(candidate)] # The analysis below may return "false positives" (ie, absence of fully- # parallel or OSIP trees when this is actually false), but this should @@ -205,7 +194,7 @@ def _analyze(self, state): # Introduce the discovered properties in the Iteration/Expression tree for k, v in list(mapper.items()): args = k.args - # 'sequential' has obviously precedence over 'parallel' + # 'sequential' kills 'parallel' properties = ('sequential',) if 'sequential' in v else tuple(v) properties = as_tuple(args.pop('properties')) + properties mapper[k] = Iteration(properties=properties, **args) diff --git a/devito/dle/manipulation.py b/devito/dle/manipulation.py index 117e42e90c..6c804a1610 100644 --- a/devito/dle/manipulation.py +++ b/devito/dle/manipulation.py @@ -7,7 +7,9 @@ def compose_nodes(nodes, retrieve=False): - """Build an Iteration/Expression tree by nesting the nodes in ``nodes``.""" + """ + Build an Iteration/Expression tree by nesting the nodes in ``nodes``. + """ l = list(nodes) tree = [] @@ -25,10 +27,12 @@ def compose_nodes(nodes, retrieve=False): def copy_arrays(mapper, reverse=False): - """Build an Iteration/Expression tree performing the copy ``k = v``, or + """ + Build an Iteration/Expression tree performing the copy ``k = v``, or ``v = k`` if reverse=True, for each (k, v) in mapper. (k, v) are expected to be of type :class:`IndexedData`. The loop bounds are inferred from - the dimensions used in ``k``.""" + the dimensions used in ``k``. + """ if not mapper: return () diff --git a/devito/nodes.py b/devito/nodes.py index 21038d732b..d9cae9b126 100644 --- a/devito/nodes.py +++ b/devito/nodes.py @@ -236,11 +236,9 @@ def stencil(self): class Iteration(Node): - """Iteration object that encapsualtes a single loop over nodes, possibly - just SymPy expressions. + """Iteration object that encapsualtes a single loop over nodes. - :param nodes: Single or list of :class:`Node` objects that - define the loop body. + :param nodes: Single or list of :class:`Node` objects defining the loop body. :param dimension: :class:`Dimension` object over which to iterate. :param limits: Limits for the iteration space, either the loop size or a tuple of the form (start, finish, stepping). diff --git a/devito/tools.py b/devito/tools.py index 9295af0e59..b5ff9416f9 100644 --- a/devito/tools.py +++ b/devito/tools.py @@ -134,27 +134,6 @@ def convert_dtype_to_ctype(dtype): return conversion_dict[dtype] -def aligned(a, alignment=16): - """Function to align the memmory - - :param a: The given memory - :param alignment: Granularity of alignment, 16 bytes by default - :returns: Reference to the start of the aligned memory - """ - if (a.ctypes.data % alignment) == 0: - return a - - extra = alignment / a.itemsize - buf = np.empty(a.size + extra, dtype=a.dtype) - ofs = (-buf.ctypes.data % alignment) / a.itemsize - aa = buf[ofs:ofs+a.size].reshape(a.shape) - np.copyto(aa, a) - - assert (aa.ctypes.data % alignment) == 0 - - return aa - - def pprint(node, verbose=True): """ Shortcut to pretty print Iteration/Expression trees. diff --git a/devito/visitors.py b/devito/visitors.py index 406be6abd6..ff85a97722 100644 --- a/devito/visitors.py +++ b/devito/visitors.py @@ -363,8 +363,9 @@ def visit_Node(self, o, ret=None): class IsPerfectIteration(Visitor): - """Return True if an :class:`Iteration` defines a perfect loop nest, - False otherwise.""" + """ + Return True if an :class:`Iteration` defines a perfect loop nest, False otherwise. + """ def visit_object(self, o, **kwargs): return False @@ -385,7 +386,8 @@ def visit_Iteration(self, o, found=False, multi=False): class Transformer(Visitor): - """Given an Iteration/Expression tree T and a mapper from nodes in T to + """ + Given an Iteration/Expression tree T and a mapper from nodes in T to a set of new nodes L, M : N --> L, build a new Iteration/Expression tree T' where a node ``n`` in N is replaced with ``M[n]``. """ @@ -419,8 +421,9 @@ def visit(self, o, *args, **kwargs): class NestedTransformer(Transformer): + """ - As opposed to a :class:`Transformer`, a :class:`NestedTransforer` applies + Unlike a :class:`Transformer`, a :class:`NestedTransforer` applies replacements in a depth-first fashion. """ From 9298768e336185a45834b6e991d298543c596f7f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 29 May 2017 18:05:54 +0200 Subject: [PATCH 02/32] nodes: Add IterationFold --- devito/nodes.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/devito/nodes.py b/devito/nodes.py index d9cae9b126..a4f33569fd 100644 --- a/devito/nodes.py +++ b/devito/nodes.py @@ -15,8 +15,8 @@ from devito.stencil import Stencil from devito.tools import as_tuple, filter_ordered -__all__ = ['Node', 'Block', 'Expression', 'Function', 'Iteration', 'List', - 'TimedList'] +__all__ = ['Node', 'Block', 'Denormals', 'Expression', 'Function', 'FunCall', + 'Iteration', 'IterationFold', 'List', 'LocalExpression', 'TimedList'] class Node(object): @@ -24,6 +24,7 @@ class Node(object): is_Node = True is_Block = False is_Iteration = False + is_IterationFold = False is_Expression = False is_Function = False is_FunCall = False @@ -506,6 +507,75 @@ def children(self): return (self.body,) +# Special nodes useful for AST manipulation + +class IterationFold(Iteration): + + """ + An IterationFold is a sequence of :class:`Iteration` objects having same + dimension, limits and properties, but different offsets and bodies. In particular, + earlier :class:`Iteration` objects in the sequence have an iteration range + which is greater or equal than that of the later :class:`Iteration` objects. + + An IterationFold, however, acts as a plain :class:`Iteration`; in other words, + its folds are "hidden" when the object is visited. + + For example, an IterationFold could be used to represent the following sequence + of loops: :: + + for i = 1 to N-1; + ... + for i = 2 to N-2; + ... + for i = 4 to N-4: + ... + + In addition to the same :class:`Iteration` parameters, an :class:`IterationFold` + accepts the parameter ``folds``, an iterable of :class:`Iteration` objects from + which folds metadata is derived. In the example above, ``folds`` would be a list + of two items, the Iterations ``for i = 2 ..`` and ``for i = 4 ..``. + """ + + is_IterationFold = True + + def __init__(self, nodes, dimension, limits, index=None, offsets=None, + properties=None, folds=None): + super(IterationFold, self).__init__(nodes, dimension, limits, index, + offsets, properties) + self.folds = folds + + def __repr__(self): + properties = "" + if self.properties: + properties = "WithProperties[%s]::" % ",".join(self.properties) + length = "Length %d" % len(self.folds) + return "<%sIterationFold %s; %s; %s>" % (properties, self.index, + self.limits, length) + + @property + def ccode(self): + comment = c.Comment('This IterationFold is "hiding" ore or more Iterations') + code = super(IterationFold, self).ccode + return c.Module([comment, code]) + + def unfold(self): + """ + Return the corresponding :class:`Iteration` objects from each fold in ``self``. + """ + args = self.args + args.pop('folds') + + # Construct the root Iteration + root = Iteration(**args) + + # Construct the folds + args.pop('nodes') + args.pop('offsets') + folds = tuple(Iteration(nodes, offsets=ofs, **args) for ofs, nodes in self.folds) + + return as_tuple(root) + folds + + # Utilities class TimedList(List): From a7b46d39607727a11f60034c8c18516ae3ed86c6 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 31 May 2017 11:17:41 +0200 Subject: [PATCH 03/32] visitors: Enhance Transformer (remove, extend) Also tiny fix to FindSections --- devito/exceptions.py | 4 ++++ devito/visitors.py | 37 +++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/devito/exceptions.py b/devito/exceptions.py index abbe5e9ad0..3750ce99bc 100644 --- a/devito/exceptions.py +++ b/devito/exceptions.py @@ -14,6 +14,10 @@ class StencilOperationError(Exception): pass +class VisitorException(Exception): + pass + + class DSEException(Exception): pass diff --git a/devito/visitors.py b/devito/visitors.py index ff85a97722..673df1e8d8 100644 --- a/devito/visitors.py +++ b/devito/visitors.py @@ -7,13 +7,14 @@ from __future__ import absolute_import import inspect -from collections import OrderedDict, defaultdict +from collections import Iterable, OrderedDict, defaultdict from operator import attrgetter import cgen as c from sympy import Symbol from devito.dimension import LoweredDimension +from devito.exceptions import VisitorException from devito.nodes import Iteration, List, Node from devito.tools import as_tuple, filter_ordered, filter_sorted, flatten @@ -225,11 +226,15 @@ def default_retval(cls): """ def visit_tuple(self, o, ret=None, queue=None): + if ret is None: + ret = self.default_retval() for i in o: ret = self.visit(i, ret=ret, queue=queue) return ret def visit_Node(self, o, ret=None, queue=None): + if ret is None: + ret = self.default_retval() for i in o.children: ret = self.visit(i, ret=ret, queue=queue) return ret @@ -390,6 +395,11 @@ class Transformer(Visitor): Given an Iteration/Expression tree T and a mapper from nodes in T to a set of new nodes L, M : N --> L, build a new Iteration/Expression tree T' where a node ``n`` in N is replaced with ``M[n]``. + + In the special case in which ``M[n]`` is None, ``n`` is dropped from T'. + + In the special case in which ``M[n]`` is an iterable of nodes, ``n`` is + "extended" by pre-pending to its body the nodes in ``M[n]``. """ def __init__(self, mapper={}): @@ -401,14 +411,24 @@ def visit_object(self, o, **kwargs): return o def visit_tuple(self, o, **kwargs): - return tuple(self.visit(i, **kwargs) for i in o) + visited = tuple(self.visit(i, **kwargs) for i in o) + return tuple(i for i in visited if i is not None) visit_list = visit_tuple def visit_Node(self, o, **kwargs): if o in self.mapper: handle = self.mapper[o] - return handle._rebuild(**handle.args) + if handle is None: + # None -> drop /o/ + return None + elif isinstance(handle, Iterable): + if not o.children: + raise VisitorException + extended = (tuple(handle) + o.children[0],) + o.children[1:] + return o._rebuild(*extended, **o.args_frozen) + else: + return handle._rebuild(**handle.args) else: rebuilt = [self.visit(i, **kwargs) for i in o.children] return o._rebuild(*rebuilt, **o.args_frozen) @@ -430,7 +450,16 @@ class NestedTransformer(Transformer): def visit_Node(self, o, **kwargs): rebuilt = [self.visit(i, **kwargs) for i in o.children] handle = self.mapper.get(o, o) - return handle._rebuild(*rebuilt, **handle.args_frozen) + if handle is None: + # None -> drop /o/ + return None + elif isinstance(handle, Iterable): + if not o.children: + raise VisitorException + extended = [tuple(handle) + rebuilt[0]] + rebuilt[1:] + return o._rebuild(*extended, **o.args_frozen) + else: + return handle._rebuild(*rebuilt, **handle.args_frozen) class SubstituteExpression(Transformer): From 07148f004e154b7eda1e10fa4a8a46eb5a3fd8f9 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 31 May 2017 11:18:23 +0200 Subject: [PATCH 04/32] profiling: Refactoring and improvements --- devito/operator.py | 120 +++-------------------------- devito/profiling.py | 184 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 182 insertions(+), 122 deletions(-) diff --git a/devito/operator.py b/devito/operator.py index 8d342c3ee3..48f9de043a 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -1,9 +1,8 @@ from __future__ import absolute_import import operator -from collections import OrderedDict, namedtuple +from collections import OrderedDict from ctypes import c_int -from functools import reduce import cgen as c import numpy as np @@ -14,18 +13,15 @@ from devito.compiler import jit_compile, load from devito.dimension import Dimension, time from devito.dle import compose_nodes, filter_iterations, transform -from devito.dse import (clusterize, estimate_cost, estimate_memory, indexify, - rewrite, q_indexed) +from devito.dse import clusterize, indexify, rewrite, q_indexed from devito.interfaces import SymbolicData, Forward, Backward from devito.logger import bar, error, info -from devito.nodes import (Element, Expression, Function, Iteration, List, - LocalExpression, TimedList) +from devito.nodes import Element, Expression, Function, Iteration, List, LocalExpression from devito.parameters import configuration -from devito.profiling import Profiler +from devito.profiling import Profiler, create_profile from devito.stencil import Stencil from devito.tools import as_tuple, filter_ordered, flatten -from devito.visitors import (FindNodes, FindSections, FindSymbols, FindScopes, - IsPerfectIteration, ResolveIterationVariable, +from devito.visitors import (FindSymbols, FindScopes, ResolveIterationVariable, SubstituteExpression, Transformer) from devito.exceptions import InvalidArgument, InvalidOperator @@ -98,8 +94,7 @@ def __init__(self, expressions, **kwargs): nodes = self._schedule_expressions(clusters, ordering) # Introduce C-level profiling infrastructure - self.sections = OrderedDict() - nodes = self._profile_sections(nodes) + nodes, self.profiler = self._profile_sections(nodes) # Parameters of the Operator (Dimensions necessary for data casts) parameters = FindSymbols('kernel-data').visit(nodes) @@ -315,13 +310,7 @@ def _extra_arguments(self): def _profile_sections(self, nodes): """Introduce C-level profiling nodes within the Iteration/Expression tree.""" - return List(body=nodes) - - def _profile_summary(self, dim_sizes): - """ - Produce a summary of the performance achieved - """ - return PerformanceSummary() + return List(body=nodes), Profiler() def _autotune(self, arguments): """Use auto-tuning on this Operator to determine empirically the @@ -373,7 +362,7 @@ def _schedule_expressions(self, clusters, ordering): # No Iterations are needed processed.extend(expressions) - return processed + return List(body=processed) def _insert_declarations(self, dle_state, parameters): """Populate the Operator's body with the required array and @@ -489,10 +478,6 @@ class OperatorCore(OperatorBasic): C code evaluating stencil expressions, can also execute the computation. """ - def __init__(self, expressions, **kwargs): - self.profiler = Profiler() - super(OperatorCore, self).__init__(expressions, **kwargs) - def __call__(self, *args, **kwargs): self.apply(*args, **kwargs) @@ -505,7 +490,7 @@ def apply(self, *args, **kwargs): self.cfunction(*list(arguments.values())) # Output summary of performance achieved - summary = self._profile_summary(dim_sizes) + summary = self.profiler.summary(dim_sizes, self.dtype) with bar(): for k, v in summary.items(): name = '%s<%s>' % (k, ','.join('%d' % i for i in v.itershape)) @@ -526,61 +511,7 @@ def _arg_shape(self, argument): def _profile_sections(self, nodes): """Introduce C-level profiling nodes within the Iteration/Expression tree.""" - mapper = {} - for node in nodes: - for itspace in FindSections().visit(node).keys(): - for i in itspace: - if IsPerfectIteration().visit(i): - # Insert `TimedList` block. This should come from - # the profiler, but we do this manually for now. - lname = 'loop_%s_%d' % (i.index, len(mapper)) - mapper[i] = TimedList(gname=self.profiler.varname, - lname=lname, body=i) - self.profiler.add(lname) - - # Estimate computational properties of the timed section - # (operational intensity, memory accesses) - expressions = FindNodes(Expression).visit(i) - ops = estimate_cost([e.expr for e in expressions]) - memory = estimate_memory([e.expr for e in expressions]) - self.sections[itspace] = Profile(lname, ops, memory) - break - processed = Transformer(mapper).visit(List(body=nodes)) - return processed - - def _profile_summary(self, dim_sizes): - """ - Produce a summary of the performance achieved - """ - summary = PerformanceSummary() - for itspace, profile in self.sections.items(): - dims = {i: i.dim.parent if i.dim.is_Buffered else i.dim for i in itspace} - - # Time - time = self.profiler.timings[profile.timer] - - # Flops - itershape = [i.extent(finish=dim_sizes.get(dims[i].name)) for i in itspace] - iterspace = reduce(operator.mul, itershape) - flops = float(profile.ops*iterspace) - gflops = flops/10**9 - - # Compulsory traffic - datashape = [i.dim.size or dim_sizes[dims[i].name] for i in itspace] - dataspace = reduce(operator.mul, datashape) - traffic = profile.memory*dataspace*self.dtype().itemsize - - # Derived metrics - oi = flops/traffic - gflopss = gflops/time - - # Keep track of performance achieved - summary.setsection(profile.timer, time, gflopss, oi, itershape, datashape) - - # Rename the most time consuming section as 'main' - summary['main'] = summary.pop(max(summary, key=summary.get)) - - return summary + return create_profile(nodes) def _extra_arguments(self): return OrderedDict([(self.profiler.typename, self.profiler.setup())]) @@ -618,34 +549,6 @@ def __new__(cls, *args, **kwargs): return obj -# Helpers for performance tracking - -PerfEntry = namedtuple('PerfEntry', 'time gflopss oi itershape datashape') -"""A helper to return structured performance data.""" - - -class PerformanceSummary(OrderedDict): - - """ - A special dictionary to track and view performance data. - """ - - def setsection(self, key, time, gflopss, oi, itershape, datashape): - self[key] = PerfEntry(time, gflopss, oi, itershape, datashape) - - @property - def gflopss(self): - return OrderedDict([(k, v.gflopss) for k, v in self.items()]) - - @property - def oi(self): - return OrderedDict([(k, v.oi) for k, v in self.items()]) - - @property - def timings(self): - return OrderedDict([(k, v.time) for k, v in self.items()]) - - # Misc helpers cnames = { @@ -654,9 +557,6 @@ def timings(self): } """A dict of standard names to be used for code generation.""" -Profile = namedtuple('Profile', 'timer ops memory') -"""A helper to track profiled sections of code.""" - def set_dle_mode(mode): """ diff --git a/devito/profiling.py b/devito/profiling.py index 487c7b725b..214406f29c 100644 --- a/devito/profiling.py +++ b/devito/profiling.py @@ -1,6 +1,89 @@ +from __future__ import absolute_import + +import operator +from collections import OrderedDict, namedtuple +from functools import reduce + from ctypes import Structure, byref, c_double from cgen import Struct, Value +from devito.dse import estimate_cost, estimate_memory +from devito.nodes import Expression, TimedList +from devito.visitors import IsPerfectIteration, FindSections, FindNodes, Transformer + +__all__ = ['Profile', 'create_profile'] + + +def create_profile(node): + """ + Create a :class:`Profiler` for the Iteration/Expression tree ``node``. + The following code sections are profiled: :: + + * The whole ``node``; + * A sequence of perfectly nested loops that have common :class:`Iteration` + dimensions, but possibly different extent. For example: :: + + for x = 0 to N + .. + for x = 1 to N-1 + .. + + Both Iterations have dimension ``x``, and will be profiled as a single + section, though their extent is different. + * Any perfectly nested loops. + """ + profiler = Profiler() + + # Group by root Iteration + mapper = OrderedDict() + for itspace in FindSections().visit(node): + mapper.setdefault(itspace[0], []).append(itspace) + + # Group sections if their iteration spaces overlap + key = lambda itspace: set([i.dim for i in itspace]) + found = [] + for v in mapper.values(): + queue = list(v) + handle = [] + while queue: + item = queue.pop(0) + if not handle or key(item) == key(handle[0]): + handle.append(item) + else: + # Found a timing section + found.append(tuple(handle)) + handle = [item] + if handle: + found.append(tuple(handle)) + + # Create and track C-level timers + mapper = OrderedDict() + for i, group in enumerate(found): + name = 'section_%d' % i + section, remainder = group[0], group[1:] + + index = len(section) > 1 and not IsPerfectIteration().visit(section[0]) + root = section[index] + + # Prepare to transform the Iteration/Expression tree + body = tuple(j[index] for j in group) + mapper[root] = TimedList(gname=profiler.varname, lname=name, body=body) + for j in remainder: + mapper[j[index]] = None + + # Estimate computational properties of the profiled section + expressions = FindNodes(Expression).visit(body) + ops = estimate_cost([e.expr for e in expressions]) + memory = estimate_memory([e.expr for e in expressions]) + + # Keep track of the new profiled section + profiler.add(name, section, ops, memory) + + # Transform the Iteration/Expression tree introducing the C-level timers + processed = Transformer(mapper).visit(node) + + return processed, profiler + class Profiler(object): @@ -12,22 +95,20 @@ class Profiler(object): typename = "profiler" def __init__(self): - self._timers = [] + # To be populated as new sections are tracked + self._sections = OrderedDict() self._C_timings = None - def add(self, name): + def add(self, name, section, ops, memory): """ - Add a C-level timer. - """ - self._timers.append(name) + Add a profiling section. - @property - def ctype(self): + :param name: The name which uniquely identifies the profiled code section. + :param section: The code section, represented as a tuple of :class:`Iteration`s. + :param ops: The number of floating-point operations in the section. + :param memory: The memory traffic in the section, as bytes moved from/to memory. """ - Returns a :class:`cgen.Struct` relative to the profiler. - """ - return Struct(Profiler.typename, - [Value('double', n) for n in self._timers]) + self._sections[section] = Profile(name, ops, memory) def setup(self): """ @@ -35,10 +116,51 @@ def setup(self): all timers added to ``self`` through ``self.add(...)``. """ cls = type("Timings", (Structure,), - {"_fields_": [(n, c_double) for n in self._timers]}) + {"_fields_": [(i.name, c_double) for i in self._sections.values()]}) self._C_timings = cls() return byref(self._C_timings) + def summary(self, dim_sizes, dtype): + """ + Return a summary of the performance numbers measured. + + :param dim_sizes: The run-time extent of each :class:`Iteration` tracked + by this Profiler. Used to compute the operational intensity + and the perfomance achieved in GFlops/s. + :param dtype: The data type of the objects in the profiled sections. Used + to compute the operational intensity. + """ + + summary = PerformanceSummary() + for itspace, profile in self._sections.items(): + dims = {i: i.dim.parent if i.dim.is_Buffered else i.dim for i in itspace} + + # Time + time = self.timings[profile.name] + + # Flops + itershape = [i.extent(finish=dim_sizes.get(dims[i].name)) for i in itspace] + iterspace = reduce(operator.mul, itershape) + flops = float(profile.ops*iterspace) + gflops = flops/10**9 + + # Compulsory traffic + datashape = [i.dim.size or dim_sizes[dims[i].name] for i in itspace] + dataspace = reduce(operator.mul, datashape) + traffic = profile.memory*dataspace*dtype().itemsize + + # Derived metrics + oi = flops/traffic + gflopss = gflops/time + + # Keep track of performance achieved + summary.setsection(profile.name, time, gflopss, oi, itershape, datashape) + + # Rename the most time consuming section as 'main' + summary['main'] = summary.pop(max(summary, key=summary.get)) + + return summary + @property def timings(self): """ @@ -48,3 +170,41 @@ def timings(self): raise RuntimeError("Cannot extract timings with non-finalized Profiler.") return {field: max(getattr(self._C_timings, field), 10**-6) for field, _ in self._C_timings._fields_} + + @property + def ctype(self): + """ + Returns a :class:`cgen.Struct` relative to the profiler. + """ + return Struct(Profiler.typename, + [Value('double', i.name) for i in self._sections.values()]) + + +class PerformanceSummary(OrderedDict): + + """ + A special dictionary to track and quickly access performance data. + """ + + def setsection(self, key, time, gflopss, oi, itershape, datashape): + self[key] = PerfEntry(time, gflopss, oi, itershape, datashape) + + @property + def gflopss(self): + return OrderedDict([(k, v.gflopss) for k, v in self.items()]) + + @property + def oi(self): + return OrderedDict([(k, v.oi) for k, v in self.items()]) + + @property + def timings(self): + return OrderedDict([(k, v.time) for k, v in self.items()]) + + +Profile = namedtuple('Profile', 'name ops memory') +"""Metadata for a profiled code section.""" + + +PerfEntry = namedtuple('PerfEntry', 'time gflopss oi itershape datashape') +"""Structured performance data.""" From 72cd2745d5b9669938c83a4f0b56782a578267e1 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 1 Jun 2017 12:16:35 +0200 Subject: [PATCH 05/32] visitors: Add FindAdjacentIterations --- devito/visitors.py | 52 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/devito/visitors.py b/devito/visitors.py index 673df1e8d8..0732a85b68 100644 --- a/devito/visitors.py +++ b/devito/visitors.py @@ -21,7 +21,8 @@ __all__ = ['FindNodes', 'FindSections', 'FindSymbols', 'FindScopes', 'IsPerfectIteration', 'SubstituteExpression', 'printAST', - 'ResolveIterationVariable', 'Transformer', 'NestedTransformer'] + 'ResolveIterationVariable', 'Transformer', 'NestedTransformer', + 'FindAdjacentIterations'] class Visitor(object): @@ -366,6 +367,55 @@ def visit_Node(self, o, ret=None): return ret +class FindAdjacentIterations(Visitor): + + @classmethod + def default_retval(cls): + return OrderedDict([('seen_iteration', False)]) + + """ + Return a mapper from nodes N in an Expression/Iteration tree to sequences of + :class:`Iteration` objects I = [I_0, I_1, ...], where N is the direct ancestor of + the items in I and all items in I are adjacent nodes in the tree. + """ + + def handler(self, o, parent=None, ret=None): + if ret is None: + ret = self.default_retval() + if parent is None: + return ret + group = [] + for i in o: + ret = self.visit(i, parent=parent, ret=ret) + if ret['seen_iteration'] is True: + group.append(i) + else: + if len(group) > 1: + ret.setdefault(parent, []).append(tuple(group)) + # Reset the group, Iterations no longer adjacent + group = [] + # Potential leftover + if len(group) > 1: + ret.setdefault(parent, []).append(tuple(group)) + return ret + + def visit_object(self, o, parent=None, ret=None): + return ret + + def visit_tuple(self, o, parent=None, ret=None): + return self.handler(o, parent=parent, ret=ret) + + def visit_Node(self, o, parent=None, ret=None): + ret = self.handler(o.children, parent=o, ret=ret) + ret['seen_iteration'] = False + return ret + + def visit_Iteration(self, o, parent=None, ret=None): + ret = self.handler(o.children, parent=o, ret=ret) + ret['seen_iteration'] = True + return ret + + class IsPerfectIteration(Visitor): """ From e87565c859a90b6cbfd90f16511375752fd0e9dc Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 7 Jun 2017 15:42:12 +0200 Subject: [PATCH 06/32] nodes: Add view property --- devito/nodes.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/devito/nodes.py b/devito/nodes.py index a4f33569fd..82bbb6e3b3 100644 --- a/devito/nodes.py +++ b/devito/nodes.py @@ -59,6 +59,14 @@ def ccode(self): """Generate C code.""" raise NotImplementedError() + @property + def view(self): + """ + Generate a representation of the Iteration/Expression tree rooted in ``self``. + """ + from devito.visitors import printAST + return printAST(self) + @property def children(self): """Return the traversable children.""" From 50639647e580b5e69966f382eb3d14e9dbe90f6f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 7 Jun 2017 15:42:37 +0200 Subject: [PATCH 07/32] dle: Enhance create_elemental_functions Capture redundant functions --- devito/dle/backends/basic.py | 42 +++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/devito/dle/backends/basic.py b/devito/dle/backends/basic.py index 9d5c3dcdf8..be419fcecc 100644 --- a/devito/dle/backends/basic.py +++ b/devito/dle/backends/basic.py @@ -33,39 +33,44 @@ def _avoid_denormals(self, state, **kwargs): @dle_pass def _create_elemental_functions(self, state, **kwargs): """ - Move :class:`Iteration` sub-trees to separate functions. + Extract :class:`Iteration` sub-trees and move them into :class:`Function`s. - By default, inner iteration trees are moved. To move different types of - :class:`Iteration`, one can provide a lambda function in ``kwargs['rule']``, - taking as input an iterable of :class:`Iteration` and returning an iterable - of :class:`Iteration` (eg, a subset, the whole iteration tree). + By default, only innermost Iteration objects containing more than + ``self.thresholds['elemental']`` operations are extracted. One can specify a + different extraction rule through the lambda function ``kwargs['rule']``, + which takes as input an iterable of :class:`Iteration`s and returns an + :class:`Iteration` node. """ noinline = self._compiler_decoration('noinline', c.Comment('noinline?')) - rule = kwargs.get('rule', lambda tree: tree[-1:]) + rule = kwargs.get('rule', lambda tree: tree[-1]) functions = [] processed = [] - for i, node in enumerate(state.nodes): + for node in state.nodes: mapper = {} - for j, tree in enumerate(retrieve_iteration_tree(node)): + for tree in retrieve_iteration_tree(node, mode='superset'): if len(tree) <= 1: continue + root = rule(tree) - name = "f_%d_%d" % (i, j) + # Has an identical body been encountered already? + view = root.view + if view in mapper: + mapper[view][1].append(root) + continue - candidate = rule(tree) - root = candidate[0] - expressions = FindNodes(Expression).visit(candidate) + name = "f_%d" % len(functions) # Heuristic: create elemental functions only if more than # self.thresholds['elemental_functions'] operations are present + expressions = FindNodes(Expression).visit(root) ops = estimate_cost([e.expr for e in expressions]) if ops < self.thresholds['elemental'] and not root.is_Elementizable: continue # Determine the elemental function's arguments ... - already_in_scope = [k.dim for k in candidate] - required = [k for k in FindSymbols(mode='free-symbols').visit(candidate) + already_in_scope = [k.dim for k in tree[tree.index(root):]] + required = [k for k in FindSymbols(mode='free-symbols').visit(root) if k not in already_in_scope] required += [as_symbol(k) for k in set(flatten(k.free_symbols for k in root.bounds_symbolic))] @@ -73,7 +78,7 @@ def _create_elemental_functions(self, state, **kwargs): args = [] seen = {e.output for e in expressions if e.is_scalar} - for d in FindSymbols('symbolics').visit(candidate): + for d in FindSymbols('symbolics').visit(root): # Add a necessary Symbolic object handle = "(float*) %s" if d.is_SymbolicFunction else "%s_vec" args.append((handle % d.name, d)) @@ -101,13 +106,16 @@ def _create_elemental_functions(self, state, **kwargs): # Track info to transform the main tree call, parameters = zip(*args) - mapper[root] = List(header=noinline, body=FunCall(name, call)) + mapper[view] = (List(header=noinline, body=FunCall(name, call)), [root]) # Produce the new function functions.append(Function(name, root, 'void', parameters, ('static',))) # Transform the main tree - processed.append(Transformer(mapper).visit(node)) + imapper = {} + for v, keys in mapper.values(): + imapper.update({k: v for k in keys}) + processed.append(Transformer(imapper).visit(node)) return {'nodes': processed, 'elemental_functions': functions} From 20f14d9d2af8bc82d938fbbbfae3c5866ae8b69f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 7 Jun 2017 15:43:29 +0200 Subject: [PATCH 08/32] dle: Enhance retrieve_iteration_tree --- devito/dle/inspection.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/devito/dle/inspection.py b/devito/dle/inspection.py index 95cd31eb66..d07688278a 100644 --- a/devito/dle/inspection.py +++ b/devito/dle/inspection.py @@ -3,7 +3,7 @@ __all__ = ['filter_iterations', 'retrieve_iteration_tree'] -def retrieve_iteration_tree(node): +def retrieve_iteration_tree(node, mode='normal'): """Return a list of all :class:`Iteration` sub-trees rooted in ``node``. For example, given the Iteration tree: @@ -19,9 +19,24 @@ def retrieve_iteration_tree(node): Return the list: :: [(Iteration i, Iteration j, Iteration k), (Iteration i, Iteration p)] + + :param node: The searched Iteration/Expression tree. + :param mode: Accepted values are 'normal' (default) and 'superset', in which + case iteration trees that are subset of larget iteration trees + are dropped. """ + assert mode in ('normal', 'superset') - return [i for i in FindSections().visit(node).keys() if i] + trees = [i for i in FindSections().visit(node) if i] + if mode == 'normal': + return trees + else: + match = [] + for i in trees: + if any(set(i).issubset(set(j)) for j in trees if i != j): + continue + match.append(i) + return match def filter_iterations(tree, key=lambda i: i, stop=lambda i: False): From 2fe02fd794489c510ee34d856544247b0c27747a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 1 Jun 2017 12:17:14 +0200 Subject: [PATCH 09/32] dle: Start IterationFold blocking [Unfinished] --- devito/dle/__init__.py | 1 + devito/dle/backends/advanced.py | 23 ++++-- devito/dle/blocking_utils.py | 140 ++++++++++++++++++++++++++++++++ devito/dle/inspection.py | 16 +++- devito/nodes.py | 71 +--------------- 5 files changed, 171 insertions(+), 80 deletions(-) create mode 100644 devito/dle/blocking_utils.py diff --git a/devito/dle/__init__.py b/devito/dle/__init__.py index e1e708e079..f4ee089bf0 100644 --- a/devito/dle/__init__.py +++ b/devito/dle/__init__.py @@ -1,4 +1,5 @@ from devito.dle.inspection import * # noqa from devito.dle.manipulation import * # noqa +from devito.dle.blocking_utils import * # noqa from devito.dle.transformer import * # noqa from devito.dle.backends import YaskGrid, init # noqa diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 4f8e75901c..794d8a5c2e 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -12,6 +12,7 @@ from devito.dimension import Dimension from devito.dle import (compose_nodes, copy_arrays, filter_iterations, + fold_blockable_tree, unfold_blocked_tree, retrieve_iteration_tree) from devito.dle.backends import (BasicRewriter, BlockingArg, dle_pass, omplang, simdinfo, get_simd_flag, get_simd_items) @@ -124,8 +125,11 @@ def _loop_blocking(self, state, **kwargs): blocked = OrderedDict() processed = [] for node in state.nodes: + # Make sure loop blocking will span as many Iterations as possible + fold = fold_blockable_tree(node) + mapper = {} - for tree in retrieve_iteration_tree(node): + for tree in retrieve_iteration_tree(fold): # Is the Iteration tree blockable ? iterations = [i for i in tree if i.is_Parallel] if 'blockinner' not in self.params: @@ -155,8 +159,8 @@ def _loop_blocking(self, state, **kwargs): start = inter_block.dim finish = start + block_size properties = 'vector-dim' if i.is_Vectorizable else None - intra_block = Iteration([], i.dim, [start, finish, 1], i.index, - properties=as_tuple(properties)) + intra_block = i._rebuild([], limits=[start, finish, 1], offsets=None, + properties=as_tuple(properties)) blocked_iterations.append((inter_block, intra_block)) @@ -165,16 +169,16 @@ def _loop_blocking(self, state, **kwargs): # non-blocked ("remainder") iterations. start = inter_block.limits[0] finish = inter_block.limits[1] - main = Iteration([], i.dim, [start, finish, 1], i.index, - properties=i.properties) + main = i._rebuild([], limits=[start, finish, 1], offsets=None, + properties=i.properties) # Build unitary-increment Iteration over the 'leftover' region: # again as above, this may be necessary when the dimension size # is not a multiple of the block size. start = inter_block.limits[1] finish = iter_size - i.offsets[1] - leftover = Iteration([], i.dim, [start, finish, 1], i.index, - properties=i.properties) + leftover = i._rebuild([], limits=[start, finish, 1], offsets=None, + properties=i.properties) regions[i] = Region(main, leftover) @@ -193,9 +197,10 @@ def _loop_blocking(self, state, **kwargs): # Will replace with blocked loop tree mapper[root] = List(body=[blocked_tree] + remainder_tree) - rebuilt = Transformer(mapper).visit(node) + rebuilt = Transformer(mapper).visit(fold) - processed.append(rebuilt) + # Finish unrolling any previously folded Iterations + processed.append(unfold_blocked_tree(rebuilt)) # All blocked dimensions if not blocked: diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py new file mode 100644 index 0000000000..df45f924f1 --- /dev/null +++ b/devito/dle/blocking_utils.py @@ -0,0 +1,140 @@ +import cgen as c + +from devito.dle import compose_nodes, is_foldable, retrieve_iteration_tree +from devito.nodes import Iteration, List +from devito.visitors import (FindAdjacentIterations, IsPerfectIteration, + NestedTransformer, Transformer) +from devito.tools import as_tuple + +__all__ = ['fold_blockable_tree', 'unfold_blocked_tree'] + + +def fold_blockable_tree(node): + """ + Create :class:`IterationFold`s from sequences of nested :class:`Iteration`. + """ + found = FindAdjacentIterations().visit(node) + found.pop('seen_iteration') + + mapper = {} + for k, v in found.items(): + for i in v: + # Check if the Iterations in /i/ are foldable or not + assert len(i) > 1 + if any(not IsPerfectIteration().visit(j) for j in i): + continue + trees = [retrieve_iteration_tree(j)[0] for j in i] + if any(len(trees[0]) != len(j) for j in trees): + continue + pairwise_folds = zip(*reversed(trees)) + if any(not is_foldable(j) for j in pairwise_folds): + continue + for j in pairwise_folds: + root, remainder = j[0], j[1:] + folds = [(tuple(y-x for x, y in zip(i.offsets, root.offsets)), i.nodes) + for i in remainder] + mapper[root] = IterationFold(folds=folds, **root.args) + for k in remainder: + mapper[k] = None + + # Insert the IterationFolds in the Iteration/Expression tree + processed = NestedTransformer(mapper).visit(node) + + return processed + + +def unfold_blocked_tree(node): + """ + Unfold nested :class:`IterationFold`. + + Examples + ======== + Given a section of Iteration/Expression tree as below: :: + + for i = 1 to N-1 // folded + for j = 1 to N-1 // folded + foo1() + + Assuming a fold with offset 1 in both /i/ and /j/ and body ``foo2()``, create: + + for i = 1 to N-1 + for j = 1 to N-1 + foo1() + for i = 2 to N-2 + for j = 2 to N-2 + foo2() + """ + # Search the unfolding candidates + candidates = [] + for tree in retrieve_iteration_tree(node): + handle = tuple(i for i in tree if i.is_IterationFold) + if handle: + # Sanity check + assert IsPerfectIteration().visit(handle[0]) + candidates.append(handle) + + # Perform unfolding + mapper = {} + for tree in candidates: + unfolded = zip(*[i.unfold() for i in tree]) + unfolded = [compose_nodes(i) for i in unfolded] + mapper[tree[0]] = List(body=unfolded) + + # Insert the unfolded Iterations in the Iteration/Expression tree + processed = Transformer(mapper).visit(node) + + return processed + + +class IterationFold(Iteration): + + """ + An IterationFold is a special :class:`Iteration` object that represents + a sequence of consecutive (in program order) Iterations. In an IterationFold, + all Iterations of the sequence but the so called ``root`` are "hidden"; that is, + they cannot be visited by an Iteration/Expression tree visitor. + + The Iterations in the sequence represented by the IterationFold all have same + dimension and properties. However, their extent is relative to that of the ``root``. + """ + + is_IterationFold = True + + def __init__(self, nodes, dimension, limits, index=None, offsets=None, + properties=None, folds=None): + super(IterationFold, self).__init__(nodes, dimension, limits, index, + offsets, properties) + self.folds = folds + + def __repr__(self): + properties = "" + if self.properties: + properties = "WithProperties[%s]::" % ",".join(self.properties) + length = "Length %d" % len(self.folds) + return "<%sIterationFold %s; %s; %s>" % (properties, self.index, + self.limits, length) + + @property + def ccode(self): + comment = c.Comment('This IterationFold is "hiding" ore or more Iterations') + code = super(IterationFold, self).ccode + return c.Module([comment, code]) + + def unfold(self): + """ + Return the corresponding :class:`Iteration` objects from each fold in ``self``. + """ + args = self.args + args.pop('folds') + + # Construct the root Iteration + root = Iteration(**args) + + # Construct the folds + args.pop('nodes') + args.pop('offsets') + start, end, incr = args.pop('limits') + folds = tuple(Iteration(nodes, limits=[start+ofs[0], end+ofs[1], incr], **args) + for ofs, nodes in self.folds) + + return folds + as_tuple(root) diff --git a/devito/dle/inspection.py b/devito/dle/inspection.py index d07688278a..535a60541e 100644 --- a/devito/dle/inspection.py +++ b/devito/dle/inspection.py @@ -1,6 +1,7 @@ from devito.visitors import FindSections +from devito.tools import as_tuple -__all__ = ['filter_iterations', 'retrieve_iteration_tree'] +__all__ = ['filter_iterations', 'retrieve_iteration_tree', 'is_foldable'] def retrieve_iteration_tree(node, mode='normal'): @@ -62,3 +63,16 @@ def filter_iterations(tree, key=lambda i: i, stop=lambda i: False): break return filtered + + +def is_foldable(nodes): + """ + Return True if the iterable ``nodes`` consists of foldable :class:`Iteration` + objects, False otherwise. + """ + nodes = as_tuple(nodes) + if len(nodes) <= 1 or any(not i.is_Iteration for i in nodes): + return False + main = nodes[0] + return all(i.dim == main.dim and i.limits == main.limits and i.index == main.index + and i.properties == main.properties for i in nodes) diff --git a/devito/nodes.py b/devito/nodes.py index 82bbb6e3b3..020c5f478b 100644 --- a/devito/nodes.py +++ b/devito/nodes.py @@ -16,7 +16,7 @@ from devito.tools import as_tuple, filter_ordered __all__ = ['Node', 'Block', 'Denormals', 'Expression', 'Function', 'FunCall', - 'Iteration', 'IterationFold', 'List', 'LocalExpression', 'TimedList'] + 'Iteration', 'List', 'LocalExpression', 'TimedList'] class Node(object): @@ -515,75 +515,6 @@ def children(self): return (self.body,) -# Special nodes useful for AST manipulation - -class IterationFold(Iteration): - - """ - An IterationFold is a sequence of :class:`Iteration` objects having same - dimension, limits and properties, but different offsets and bodies. In particular, - earlier :class:`Iteration` objects in the sequence have an iteration range - which is greater or equal than that of the later :class:`Iteration` objects. - - An IterationFold, however, acts as a plain :class:`Iteration`; in other words, - its folds are "hidden" when the object is visited. - - For example, an IterationFold could be used to represent the following sequence - of loops: :: - - for i = 1 to N-1; - ... - for i = 2 to N-2; - ... - for i = 4 to N-4: - ... - - In addition to the same :class:`Iteration` parameters, an :class:`IterationFold` - accepts the parameter ``folds``, an iterable of :class:`Iteration` objects from - which folds metadata is derived. In the example above, ``folds`` would be a list - of two items, the Iterations ``for i = 2 ..`` and ``for i = 4 ..``. - """ - - is_IterationFold = True - - def __init__(self, nodes, dimension, limits, index=None, offsets=None, - properties=None, folds=None): - super(IterationFold, self).__init__(nodes, dimension, limits, index, - offsets, properties) - self.folds = folds - - def __repr__(self): - properties = "" - if self.properties: - properties = "WithProperties[%s]::" % ",".join(self.properties) - length = "Length %d" % len(self.folds) - return "<%sIterationFold %s; %s; %s>" % (properties, self.index, - self.limits, length) - - @property - def ccode(self): - comment = c.Comment('This IterationFold is "hiding" ore or more Iterations') - code = super(IterationFold, self).ccode - return c.Module([comment, code]) - - def unfold(self): - """ - Return the corresponding :class:`Iteration` objects from each fold in ``self``. - """ - args = self.args - args.pop('folds') - - # Construct the root Iteration - root = Iteration(**args) - - # Construct the folds - args.pop('nodes') - args.pop('offsets') - folds = tuple(Iteration(nodes, offsets=ofs, **args) for ofs, nodes in self.folds) - - return as_tuple(root) + folds - - # Utilities class TimedList(List): From 66b524f4fbecf9c75fb03964059f6f076bc62c22 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 5 Jun 2017 16:18:37 +0200 Subject: [PATCH 10/32] dle: More IterationFold blocking [Unfinished] --- devito/dle/backends/advanced.py | 7 +-- devito/dle/blocking_utils.py | 92 +++++++++++++++++++++++++++++---- 2 files changed, 86 insertions(+), 13 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 794d8a5c2e..36cada9163 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -31,7 +31,7 @@ class DevitoRewriter(BasicRewriter): def _pipeline(self, state): self._avoid_denormals(state) self._loop_fission(state) - self._create_elemental_functions(state) + #self._create_elemental_functions(state) self._loop_blocking(state) self._simdize(state) if self.params['openmp'] is True: @@ -121,18 +121,19 @@ def _loop_blocking(self, state, **kwargs): :class:`Iteration` objects. """ Region = namedtuple('Region', 'main leftover') + exclude_innermost = 'blockinner' not in self.params blocked = OrderedDict() processed = [] for node in state.nodes: # Make sure loop blocking will span as many Iterations as possible - fold = fold_blockable_tree(node) + fold = fold_blockable_tree(node, exclude_innermost) mapper = {} for tree in retrieve_iteration_tree(fold): # Is the Iteration tree blockable ? iterations = [i for i in tree if i.is_Parallel] - if 'blockinner' not in self.params: + if exclude_innermost: iterations = [i for i in iterations if not i.is_Vectorizable] if not iterations: continue diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py index df45f924f1..e43b1f2c92 100644 --- a/devito/dle/blocking_utils.py +++ b/devito/dle/blocking_utils.py @@ -1,15 +1,18 @@ import cgen as c +from sympy import Eq, Symbol +import numpy as np from devito.dle import compose_nodes, is_foldable, retrieve_iteration_tree -from devito.nodes import Iteration, List -from devito.visitors import (FindAdjacentIterations, IsPerfectIteration, +from devito.dse import retrieve_indexed +from devito.nodes import Expression, Iteration, List, LocalExpression +from devito.visitors import (FindAdjacentIterations, FindNodes, IsPerfectIteration, NestedTransformer, Transformer) -from devito.tools import as_tuple +from devito.tools import as_tuple, flatten __all__ = ['fold_blockable_tree', 'unfold_blocked_tree'] -def fold_blockable_tree(node): +def fold_blockable_tree(node, exclude_innermost=False): """ Create :class:`IterationFold`s from sequences of nested :class:`Iteration`. """ @@ -19,17 +22,26 @@ def fold_blockable_tree(node): mapper = {} for k, v in found.items(): for i in v: - # Check if the Iterations in /i/ are foldable or not + # Pre-condition: they all must be perfect iterations assert len(i) > 1 if any(not IsPerfectIteration().visit(j) for j in i): continue + # Only retain consecutive trees having same depth trees = [retrieve_iteration_tree(j)[0] for j in i] - if any(len(trees[0]) != len(j) for j in trees): + handle = [] + for j in trees: + if len(j) != len(trees[0]): + break + handle.append(j) + trees = handle + if not trees: continue + # Check foldability pairwise_folds = zip(*reversed(trees)) if any(not is_foldable(j) for j in pairwise_folds): continue - for j in pairwise_folds: + # Perform folding + for j in pairwise_folds[:-exclude_innermost]: root, remainder = j[0], j[1:] folds = [(tuple(y-x for x, y in zip(i.offsets, root.offsets)), i.nodes) for i in remainder] @@ -76,9 +88,10 @@ def unfold_blocked_tree(node): # Perform unfolding mapper = {} for tree in candidates: - unfolded = zip(*[i.unfold() for i in tree]) - unfolded = [compose_nodes(i) for i in unfolded] - mapper[tree[0]] = List(body=unfolded) + trees = zip(*[i.unfold() for i in tree]) + trees = optimize_unfolded_tree(trees[:-1], trees[-1]) + trees = [compose_nodes(i) for i in trees] + mapper[tree[0]] = List(body=trees) # Insert the unfolded Iterations in the Iteration/Expression tree processed = Transformer(mapper).visit(node) @@ -86,6 +99,65 @@ def unfold_blocked_tree(node): return processed +def optimize_unfolded_tree(unfolded, root): + """ + Transform folded trees to reduce the memory footprint. + + Examples + ======== + Given: + + .. code-block:: + for i = 1 to N - 1 # Folded tree + for j = 1 to N - 1 + tmp[i,j] = ... + for i = 2 to N - 2 # Root + for j = 2 to N - 2 + ... = ... tmp[i,j] ... + + The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the + iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the + ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting + iteration tree becomes: + + .. code-block:: + for i = 1 to i_bs + 1 # Folded tree + for j = 1 to j_bs + 1 + i' = i + i_block - 2 + j' = j + j_block - 2 + tmp[i,j] = ... # use i' and j' + for i = i_block to i_block + i_bs # Root + for j = j_block to j_block + j_bs + i' = i - x_block + j' = j - j_block + ... = ... tmp[i',j'] ... + """ + processed = [] + for i, tree in enumerate(unfolded): + otree = [] + stmts = [] + mapper = {} + + # "Shrink" the iteration space + for j in tree: + start, end, incr = j.args['limits'] + otree.append(j._rebuild(limits=[0, end-start, incr])) + index = Symbol('%ss%d' % (j.index, i)) + stmts.append(LocalExpression(Eq(index, j.dim + start), np.int32)) + mapper[j.dim] = index + + # Substitute iteration variables within the folded trees + expressions = FindNodes(Expression).visit(otree[-1]) + substituted = [i._rebuild(expr=Eq(i.expr.lhs, i.expr.rhs.xreplace(mapper))) + for i in expressions] + + handle = Transformer(dict(zip(expressions, substituted))).visit(otree[-1]) + handle = handle._rebuild(nodes=(stmts + list(handle.nodes))) + processed.append(tuple(otree[:-1]) + (handle,)) + + return processed + [root] + + class IterationFold(Iteration): """ From ae35b2a23dee56e9f270333eab72f083d9c8f1d3 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 7 Jun 2017 15:45:04 +0200 Subject: [PATCH 11/32] dle: Tiny fix to propagate properties when blocking --- devito/dle/backends/advanced.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 36cada9163..031d80b975 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -159,9 +159,7 @@ def _loop_blocking(self, state, **kwargs): # Build Iteration within a block start = inter_block.dim finish = start + block_size - properties = 'vector-dim' if i.is_Vectorizable else None - intra_block = i._rebuild([], limits=[start, finish, 1], offsets=None, - properties=as_tuple(properties)) + intra_block = i._rebuild([], limits=[start, finish, 1], offsets=None) blocked_iterations.append((inter_block, intra_block)) @@ -170,16 +168,14 @@ def _loop_blocking(self, state, **kwargs): # non-blocked ("remainder") iterations. start = inter_block.limits[0] finish = inter_block.limits[1] - main = i._rebuild([], limits=[start, finish, 1], offsets=None, - properties=i.properties) + main = i._rebuild([], limits=[start, finish, 1], offsets=None) # Build unitary-increment Iteration over the 'leftover' region: # again as above, this may be necessary when the dimension size # is not a multiple of the block size. start = inter_block.limits[1] finish = iter_size - i.offsets[1] - leftover = i._rebuild([], limits=[start, finish, 1], offsets=None, - properties=i.properties) + leftover = i._rebuild([], limits=[start, finish, 1], offsets=None) regions[i] = Region(main, leftover) From d28866051c57e546b2dcfe8a89dedc6bf9a2375c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Sat, 10 Jun 2017 15:51:36 +0200 Subject: [PATCH 12/32] dle: Optimized IterationFold blocking [Unfinished] --- devito/dle/blocking_utils.py | 28 +++++++++++++++++++++------- devito/dse/manipulation.py | 20 +++++++++++++++++--- devito/interfaces.py | 10 +++++++--- devito/operator.py | 3 +++ 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py index e43b1f2c92..fdb635694d 100644 --- a/devito/dle/blocking_utils.py +++ b/devito/dle/blocking_utils.py @@ -3,7 +3,7 @@ import numpy as np from devito.dle import compose_nodes, is_foldable, retrieve_iteration_tree -from devito.dse import retrieve_indexed +from devito.dse import xreplace_indices from devito.nodes import Expression, Iteration, List, LocalExpression from devito.visitors import (FindAdjacentIterations, FindNodes, IsPerfectIteration, NestedTransformer, Transformer) @@ -143,18 +143,32 @@ def optimize_unfolded_tree(unfolded, root): start, end, incr = j.args['limits'] otree.append(j._rebuild(limits=[0, end-start, incr])) index = Symbol('%ss%d' % (j.index, i)) - stmts.append(LocalExpression(Eq(index, j.dim + start), np.int32)) + stmts.append((LocalExpression(Eq(index, j.dim + start), np.int32), + LocalExpression(Eq(index, j.dim - start), np.int32))) mapper[j.dim] = index # Substitute iteration variables within the folded trees - expressions = FindNodes(Expression).visit(otree[-1]) - substituted = [i._rebuild(expr=Eq(i.expr.lhs, i.expr.rhs.xreplace(mapper))) - for i in expressions] + exprs = FindNodes(Expression).visit(otree[-1]) + replaced = xreplace_indices([j.expr for j in exprs], mapper, only_rhs=True) + subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)] - handle = Transformer(dict(zip(expressions, substituted))).visit(otree[-1]) - handle = handle._rebuild(nodes=(stmts + list(handle.nodes))) + handle = Transformer(dict(zip(exprs, subs))).visit(otree[-1]) + handle = handle._rebuild(nodes=(zip(*stmts)[0] + handle.nodes)) processed.append(tuple(otree[:-1]) + (handle,)) + # Temporary arrays are now local storage + for j in subs: + j.output_function._mem_stack = True + + # Introduce the new iteration variables within root + candidates = [j.output for j in subs] + exprs = FindNodes(Expression).visit(root[-1]) + replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates) + subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)] + handle = Transformer(dict(zip(exprs, subs))).visit(root[-1]) + handle = handle._rebuild(nodes=(zip(*stmts)[1] + handle.nodes)) + root = root[:-1] + (handle,) + return processed + [root] diff --git a/devito/dse/manipulation.py b/devito/dse/manipulation.py index 35dfe34b2d..7a59dbd2dd 100644 --- a/devito/dse/manipulation.py +++ b/devito/dse/manipulation.py @@ -7,14 +7,14 @@ from sympy import Indexed, collect, collect_const, flatten from devito.dse.extended_sympy import Add, Eq, Mul -from devito.dse.inspection import count, estimate_cost +from devito.dse.inspection import count, estimate_cost, retrieve_indexed from devito.dse.graph import temporaries_graph from devito.dse.queries import q_op from devito.interfaces import TensorFunction from devito.tools import as_tuple __all__ = ['collect_nested', 'common_subexprs_elimination', 'freeze_expression', - 'xreplace_constrained', 'promote_scalar_expressions'] + 'xreplace_constrained', 'xreplace_indices', 'promote_scalar_expressions'] def freeze_expression(expr): @@ -105,7 +105,7 @@ def run(expr): def xreplace_constrained(exprs, make, rule, cm=lambda e: True, repeat=False): """ - As opposed to ``xreplace``, which replaces all objects specified in a mapper, + Unlike ``xreplace``, which replaces all objects specified in a mapper, this function replaces all objects satisfying two criteria: :: * The "matching rule" -- a function returning True if a node within ``expr`` @@ -187,6 +187,20 @@ def run(expr): return found + rebuilt, found +def xreplace_indices(exprs, mapper, candidates=None, only_rhs=False): + """ + Create new expressions from ``exprs``, by replacing all index variables + specified in mapper appearing as a tensor index. Only tensors whose symbolic + name appears in ``candidates`` are considered if ``candidates`` is not None. + """ + get = lambda i: i.rhs if only_rhs is True else i + handle = flatten(retrieve_indexed(get(i)) for i in as_tuple(exprs)) + if candidates is not None: + handle = [i for i in handle if i.base.label in candidates] + mapper = dict(zip(handle, [i.xreplace(mapper) for i in handle])) + return [i.xreplace(mapper) for i in exprs] + + def common_subexprs_elimination(exprs, make, mode='default'): """ Perform common subexpressions elimination. diff --git a/devito/interfaces.py b/devito/interfaces.py index 892c69be99..1d74a94a7a 100644 --- a/devito/interfaces.py +++ b/devito/interfaces.py @@ -215,7 +215,7 @@ def __init__(self, *args, **kwargs): self.shape = kwargs.get('shape') self.indices = kwargs.get('dimensions') self.dtype = kwargs.get('dtype', np.float32) - self.onstack = kwargs.get('onstack', False) + self._onstack = kwargs.get('onstack', False) @classmethod def _indices(cls, **kwargs): @@ -223,11 +223,15 @@ def _indices(cls, **kwargs): @property def _mem_stack(self): - return self.onstack + return self._onstack + + @_mem_stack.setter + def _mem_stack(self, val): + self._onstack = val @property def _mem_heap(self): - return not self.onstack + return not self._onstack class SymbolicData(AbstractSymbol): diff --git a/devito/operator.py b/devito/operator.py index 48f9de043a..bd1f1ba085 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -399,6 +399,7 @@ def _insert_declarations(self, dle_state, parameters): allocator.push_heap(k.output_function) # Introduce declarations on the stack + from IPython import embed; embed() for k, v in allocator.onstack: allocs = as_tuple([Element(i) for i in v]) mapper[k] = Iteration(allocs + k.nodes, **k.args_frozen) @@ -497,6 +498,8 @@ def apply(self, *args, **kwargs): info("Section %s with OI=%.2f computed in %.3f s [Perf: %.2f GFlops/s]" % (name, v.oi, v.time, v.gflopss)) + print "u_sum = ", arguments['u'].sum() + print "v_sum = ", arguments['v'].sum() return summary def _arg_data(self, argument): From fa033a6331568ea948924c6fd8c44dcd65baadff Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 15 Jun 2017 00:12:36 +0200 Subject: [PATCH 13/32] Make indexed AbstractSymbol point to the same function --- devito/interfaces.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/devito/interfaces.py b/devito/interfaces.py index 1d74a94a7a..006366bf3c 100644 --- a/devito/interfaces.py +++ b/devito/interfaces.py @@ -118,6 +118,12 @@ def __new__(cls, *args, **kwargs): options = kwargs.get('options', {}) newobj = Function.__new__(newcls, *args, **options) newobj.__init__(*args, **kwargs) + + # All objects cached on the AbstractSymbol /newobj/ keep a reference + # to /newobj/ through the /function/ field. Thus, all indexified + # object will point to /newobj/, the "actual Function". + newobj.function = newobj + # Store new instance in symbol cache newcls._cache_put(newobj) return newobj @@ -135,7 +141,7 @@ def dim(self): @property def indexed(self): """Extract a :class:`IndexedData` object from the current object.""" - return IndexedData(self.name, shape=self.shape, function=self) + return IndexedData(self.name, shape=self.shape, function=self.function) @property def _mem_external(self): From 1f673bd542cfa70de785a70d394387b669e9f8f7 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 15 Jun 2017 19:25:56 +0200 Subject: [PATCH 14/32] dle: Enhance filter_iterations --- devito/dle/backends/advanced.py | 2 +- devito/dle/inspection.py | 28 ++++++++++++++++++++++------ devito/operator.py | 3 +-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 031d80b975..87471e603e 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -282,7 +282,7 @@ def _ompize(self, state, **kwargs): for tree in retrieve_iteration_tree(node): # Determine the number of consecutive parallelizable Iterations key = lambda i: i.is_Parallel and not i.is_Vectorizable - candidates = filter_iterations(tree, key=key, stop='consecutive') + candidates = filter_iterations(tree, key=key, stop='any') if not candidates: continue diff --git a/devito/dle/inspection.py b/devito/dle/inspection.py index 535a60541e..f45b6b097a 100644 --- a/devito/dle/inspection.py +++ b/devito/dle/inspection.py @@ -40,26 +40,42 @@ def retrieve_iteration_tree(node, mode='normal'): return match -def filter_iterations(tree, key=lambda i: i, stop=lambda i: False): +def filter_iterations(tree, key=lambda i: i, stop=lambda: False): """ Given an iterable of :class:`Iteration` objects, return a new list containing all items such that ``key(o)`` is True. This function accepts an optional argument ``stop``. This may be either a - lambda function, specifying a stop criterium, or the special keyword - 'consecutive', which makes the function return as soon as ``key(o)`` - gives False and at least one item has been collected. + lambda function, specifying a stop criterium, or any of the following + special keywords: :: + + * 'any': Return as soon as ``key(o)`` is False and at least one + item has been collected. + * 'asap': Return as soon as at least one item has been collected and + all items for which ``key(o)`` is False have been encountered. + + It is useful to specify a ``stop`` criterium when one is searching the + first Iteration in an Iteration/Expression tree for which a given property + does not hold. """ + assert callable(stop) or stop in ['any', 'asap'] + tree = list(tree) filtered = [] + off = [] - if stop == 'consecutive': + if stop == 'any': stop = lambda: len(filtered) > 0 + elif stop == 'asap': + hits = [i for i in tree if not key(i)] + stop = lambda: len(filtered) > 0 and len(off) == len(hits) for i in tree: if key(i): filtered.append(i) - elif stop(): + else: + off.append(i) + if stop(): break return filtered diff --git a/devito/operator.py b/devito/operator.py index bd1f1ba085..e2431f35d1 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -392,14 +392,13 @@ def _insert_declarations(self, dle_state, parameters): elif k.output_function._mem_stack: # On the stack, as established by the DLE key = lambda i: i.dim not in k.output_function.indices - site = filter_iterations(v, key=key, stop='consecutive') + site = filter_iterations(reversed(v), key=key, stop='asap') or [nodes] allocator.push_stack(site[-1], k.output_function) else: # On the heap, as a tensor that must be globally accessible allocator.push_heap(k.output_function) # Introduce declarations on the stack - from IPython import embed; embed() for k, v in allocator.onstack: allocs = as_tuple([Element(i) for i in v]) mapper[k] = Iteration(allocs + k.nodes, **k.args_frozen) From 634f03fb9ed21d28b5ccf6c85dad15a6a76950fc Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 15 Jun 2017 19:27:57 +0200 Subject: [PATCH 15/32] operator: Fix declarator with IterationFold blocking --- devito/operator.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/devito/operator.py b/devito/operator.py index e2431f35d1..1faa53137d 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -22,7 +22,7 @@ from devito.stencil import Stencil from devito.tools import as_tuple, filter_ordered, flatten from devito.visitors import (FindSymbols, FindScopes, ResolveIterationVariable, - SubstituteExpression, Transformer) + SubstituteExpression, Transformer, NestedTransformer) from devito.exceptions import InvalidArgument, InvalidOperator __all__ = ['Operator'] @@ -400,9 +400,8 @@ def _insert_declarations(self, dle_state, parameters): # Introduce declarations on the stack for k, v in allocator.onstack: - allocs = as_tuple([Element(i) for i in v]) - mapper[k] = Iteration(allocs + k.nodes, **k.args_frozen) - nodes = Transformer(mapper).visit(nodes) + mapper[k] = tuple(Element(i) for i in v) + nodes = NestedTransformer(mapper).visit(nodes) elemental_functions = Transformer(mapper).visit(dle_state.elemental_functions) # Introduce declarations on the heap (if any) From 45f56ae2868a41005ce6adbcde41b7858f554a8b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 15 Jun 2017 19:29:31 +0200 Subject: [PATCH 16/32] interfaces: Allow updating state of SymbolicFunctions --- devito/interfaces.py | 24 +++++++++++++++++++----- devito/operator.py | 4 ++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/devito/interfaces.py b/devito/interfaces.py index 006366bf3c..9c933d5cd4 100644 --- a/devito/interfaces.py +++ b/devito/interfaces.py @@ -171,7 +171,13 @@ def indexify(self): class SymbolicFunction(AbstractSymbol): - """A symbolic function object.""" + + """ + A symbolic function object, created and managed directly by Devito. + + Unlike :class:`SymbolicData` objects, the state of a SymbolicFunction + is mutable. + """ is_SymbolicFunction = True @@ -179,6 +185,9 @@ def __new__(cls, *args, **kwargs): kwargs.update({'options': {'evaluate': False}}) return AbstractSymbol.__new__(cls, *args, **kwargs) + def update(self): + return + class ScalarFunction(SymbolicFunction): """Symbolic object representing a scalar. @@ -202,6 +211,9 @@ def _mem_stack(self): in a C module, False otherwise.""" return True + def update(self, dtype=None): + self.dtype = dtype or self.dtype + class TensorFunction(SymbolicFunction): """Symbolic object representing a tensor. @@ -231,14 +243,16 @@ def _indices(cls, **kwargs): def _mem_stack(self): return self._onstack - @_mem_stack.setter - def _mem_stack(self, val): - self._onstack = val - @property def _mem_heap(self): return not self._onstack + def update(self, dtype=None, shape=None, dimensions=None, onstack=None): + self.dtype = dtype or self.dtype + self.shape = shape or self.shape + self.indices = dimensions or self.indices + self._onstack = onstack or self._mem_stack + class SymbolicData(AbstractSymbol): """A symbolic object associated with data.""" diff --git a/devito/operator.py b/devito/operator.py index 1faa53137d..53be3fc157 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -404,6 +404,10 @@ def _insert_declarations(self, dle_state, parameters): nodes = NestedTransformer(mapper).visit(nodes) elemental_functions = Transformer(mapper).visit(dle_state.elemental_functions) + # TODO + # Use x_block_size+2, y_block_size+2 instead of x_size, y_size as shape of + # the tensor functions + # Introduce declarations on the heap (if any) if allocator.onheap: decls, allocs, frees = zip(*allocator.onheap) From 4618161f29e711663bf3af7ecea5a6d30e538183 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 22 Jun 2017 12:15:48 +0200 Subject: [PATCH 17/32] Introduce symbolic_shape --- devito/dse/symbolics.py | 2 +- devito/interfaces.py | 19 +++++++++++++++++++ devito/nodes.py | 3 ++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/devito/dse/symbolics.py b/devito/dse/symbolics.py index 511e61ce4a..339f0ec192 100644 --- a/devito/dse/symbolics.py +++ b/devito/dse/symbolics.py @@ -302,7 +302,7 @@ def _eliminate_inter_stencil_redundancies(self, cluster, **kwargs): # Redundancies will be stored in space-varying temporaries g = cluster.trace indices = g.space_indices - shape = g.space_shape + shape = tuple(i.symbolic_size for i in indices) # Template for captured redundancies name = self.conventions['redundancy'] + "%d" diff --git a/devito/interfaces.py b/devito/interfaces.py index 9c933d5cd4..bd2305c2a8 100644 --- a/devito/interfaces.py +++ b/devito/interfaces.py @@ -143,6 +143,25 @@ def indexed(self): """Extract a :class:`IndexedData` object from the current object.""" return IndexedData(self.name, shape=self.shape, function=self.function) + @property + def symbolic_shape(self): + """ + Return the symbolic shape of the object. For an entry ``E`` in ``self.shape``, + there are two possibilities: :: + + * ``E`` is already in symbolic form, then simply use ``E``. + * ``E`` is an integer representing the size along a dimension ``D``, + then, use a symbolic representation of ``D``. + """ + sshape = [] + for i, j in zip(self.shape, self.indices): + try: + i.is_algebraic_expr() + sshape.append(i) + except AttributeError: + sshape.append(j.symbolic_size) + return tuple(sshape) + @property def _mem_external(self): """Return True if the associated data was/is/will be allocated directly diff --git a/devito/nodes.py b/devito/nodes.py index 020c5f478b..ef70c0945f 100644 --- a/devito/nodes.py +++ b/devito/nodes.py @@ -488,7 +488,8 @@ def _ccasts(self): alignment = "__attribute__((aligned(64)))" handle = [f for f in self.parameters if isinstance(f, (SymbolicData, TensorFunction))] - shapes = [(f, ''.join(["[%s]" % i.ccode for i in f.indices[1:]])) for f in handle] + shapes = [(f, ''.join(["[%s]" % ccode(i) for i in f.symbolic_shape[1:]])) + for f in handle] casts = [c.Initializer(c.POD(v.dtype, '(*restrict %s)%s %s' % (v.name, shape, alignment)), '(%s (*)%s) %s' % (c.dtype_to_ctype(v.dtype), From daa0ff1c7adfdab2a9d72585710695e1231b87a6 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 15 Jun 2017 19:24:14 +0200 Subject: [PATCH 18/32] Always generate parameters, not numbers --- devito/cgen_utils.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/devito/cgen_utils.py b/devito/cgen_utils.py index c80c35a7a4..c34447de14 100644 --- a/devito/cgen_utils.py +++ b/devito/cgen_utils.py @@ -20,14 +20,10 @@ def push_stack(self, scope, obj): """ Generate a cgen statement that allocates ``obj`` on the stack. """ - dtype = c.dtype_to_ctype(obj.dtype) - shape = "".join("[%d]" % j for j in obj.shape) + shape = "".join("[%s]" % ccode(i) for i in obj.symbolic_shape) alignment = "__attribute__((aligned(64)))" - - item = c.POD(dtype, "%s%s %s" % (obj.name, shape, alignment)) - handle = self.stack.setdefault(scope, []) - if item not in handle: - handle.append(item) + handle = self.stack.setdefault(scope, OrderedDict()) + handle[obj] = c.POD(obj.dtype, "%s%s %s" % (obj.name, shape, alignment)) def push_heap(self, obj): """ @@ -37,10 +33,11 @@ def push_heap(self, obj): if obj in self.heap: return - decl = "(*%s)%s" % (obj.name, "".join("[%d]" % j for j in obj.shape[1:])) + decl = "(*%s)%s" % (obj.name, + "".join("[%s]" % i.symbolic_size for i in obj.indices[1:])) decl = c.Value(c.dtype_to_ctype(obj.dtype), decl) - shape = "".join("[%d]" % j for j in obj.shape) + shape = "".join("[%s]" % i.symbolic_size for i in obj.indices) alloc = "posix_memalign((void**)&%s, 64, sizeof(%s%s))" alloc = alloc % (obj.name, c.dtype_to_ctype(obj.dtype), shape) alloc = c.Statement(alloc) @@ -51,7 +48,7 @@ def push_heap(self, obj): @property def onstack(self): - return self.stack.items() + return [(k, v.values()) for k, v in self.stack.items()] @property def onheap(self): From 7e61775a160ae460bf8219964ae404b4164dbc9d Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 22 Jun 2017 12:16:32 +0200 Subject: [PATCH 19/32] Decorate remainder loops --- devito/dle/backends/advanced.py | 6 ++++-- devito/nodes.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 87471e603e..3c7af0d6dc 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -168,14 +168,16 @@ def _loop_blocking(self, state, **kwargs): # non-blocked ("remainder") iterations. start = inter_block.limits[0] finish = inter_block.limits[1] - main = i._rebuild([], limits=[start, finish, 1], offsets=None) + main = i._rebuild([], limits=[start, finish, 1], offsets=None, + properties=('parallel', 'remainder')) # Build unitary-increment Iteration over the 'leftover' region: # again as above, this may be necessary when the dimension size # is not a multiple of the block size. start = inter_block.limits[1] finish = iter_size - i.offsets[1] - leftover = i._rebuild([], limits=[start, finish, 1], offsets=None) + leftover = i._rebuild([], limits=[start, finish, 1], offsets=None, + properties=('parallel', 'remainder')) regions[i] = Region(main, leftover) diff --git a/devito/nodes.py b/devito/nodes.py index ef70c0945f..1e152d944d 100644 --- a/devito/nodes.py +++ b/devito/nodes.py @@ -379,6 +379,10 @@ def is_Vectorizable(self): def is_Elementizable(self): return 'elemental' in self.properties + @property + def is_Remainder(self): + return 'remainder' in self.properties + @property def bounds_symbolic(self): """Return a 2-tuple representing the symbolic bounds of the object.""" From 7332982d862fba1fab46e435f2be1187d002e55f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 22 Jun 2017 12:16:42 +0200 Subject: [PATCH 20/32] dle: Optimized IterationFold blocking [Unfinished] --- devito/dle/blocking_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py index fdb635694d..43d459ae02 100644 --- a/devito/dle/blocking_utils.py +++ b/devito/dle/blocking_utils.py @@ -156,9 +156,12 @@ def optimize_unfolded_tree(unfolded, root): handle = handle._rebuild(nodes=(zip(*stmts)[0] + handle.nodes)) processed.append(tuple(otree[:-1]) + (handle,)) - # Temporary arrays are now local storage - for j in subs: - j.output_function._mem_stack = True + # Temporary arrays can now be moved to the stack + if all(not j.is_Remainder for j in otree): + shape = tuple(j.bounds_symbolic[1] for j in otree) + for j in subs: + shape += j.output_function.shape[len(otree):] + j.output_function.update(shape=shape, onstack=True) # Introduce the new iteration variables within root candidates = [j.output for j in subs] From 76703dfa100db90ebb64e41ae00ac26de844491a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 22 Jun 2017 15:13:31 +0200 Subject: [PATCH 21/32] dle: Simplify and fix elemental functions --- devito/dimension.py | 7 +++-- devito/dle/backends/basic.py | 50 +++++++++++++----------------------- tests/test_dle.py | 16 ++++++------ 3 files changed, 31 insertions(+), 42 deletions(-) diff --git a/devito/dimension.py b/devito/dimension.py index 62bc16ae81..69a0a57452 100644 --- a/devito/dimension.py +++ b/devito/dimension.py @@ -1,7 +1,7 @@ import cgen import numpy as np -from sympy import Symbol +from sympy import Number, Symbol __all__ = ['Dimension', 'x', 'y', 'z', 't', 'p', 'd', 'time'] @@ -32,7 +32,10 @@ def __str__(self): @property def symbolic_size(self): """The symbolic size of this dimension.""" - return Symbol(self.ccode) + try: + return Number(self.ccode) + except ValueError: + return Symbol(self.ccode) @property def ccode(self): diff --git a/devito/dle/backends/basic.py b/devito/dle/backends/basic.py index be419fcecc..99dfa2be3d 100644 --- a/devito/dle/backends/basic.py +++ b/devito/dle/backends/basic.py @@ -68,41 +68,27 @@ def _create_elemental_functions(self, state, **kwargs): if ops < self.thresholds['elemental'] and not root.is_Elementizable: continue - # Determine the elemental function's arguments ... - already_in_scope = [k.dim for k in tree[tree.index(root):]] - required = [k for k in FindSymbols(mode='free-symbols').visit(root) - if k not in already_in_scope] - required += [as_symbol(k) for k in - set(flatten(k.free_symbols for k in root.bounds_symbolic))] - required = set(required) - + # Determine the arguments required by the elemental function + in_scope = [i.dim for i in tree[tree.index(root):]] + required = FindSymbols(mode='free-symbols').visit(root) + for i in FindSymbols('symbolics').visit(root): + required.extend(flatten(j.free_symbols for j in i.symbolic_shape)) + required = set([as_symbol(i) for i in required if i not in in_scope]) + + # Add tensor arguments args = [] seen = {e.output for e in expressions if e.is_scalar} - for d in FindSymbols('symbolics').visit(root): - # Add a necessary Symbolic object - handle = "(float*) %s" if d.is_SymbolicFunction else "%s_vec" - args.append((handle % d.name, d)) - seen |= {as_symbol(d)} - # Add necessary information related to Dimensions - for k in d.indices: - if k.size is not None: - continue - # Dimension size - size = k.symbolic_size - if size not in seen: - args.append((k.ccode, k)) - seen |= {size} - # Dimension index may be required too - if k in required - seen: - index_arg = (k.name, ScalarFunction(name=k.name, - dtype=np.int32)) - args.append(index_arg) - seen |= {k} - - # Add non-temporary scalars to the elemental function's arguments + for i in FindSymbols('symbolics').visit(root): + if i.is_SymbolicFunction: + handle = "(%s*) %s" % (c.dtype_to_ctype(i.dtype), i.name) + else: + handle = "%s_vec" % i.name + args.append((handle, i)) + seen |= {as_symbol(i)} + # Add scalar arguments handle = filter_sorted(required - seen, key=attrgetter('name')) - args.extend([(k.name, ScalarFunction(name=k.name, dtype=np.int32)) - for k in handle]) + args.extend([(i.name, ScalarFunction(name=i.name, dtype=np.int32)) + for i in handle]) # Track info to transform the main tree call, parameters = zip(*args) diff --git a/tests/test_dle.py b/tests/test_dle.py index 48b42c0f89..403847f59c 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -148,11 +148,11 @@ def test_create_elemental_functions_simple(simple_function): { for (int j = 0; j < 5; j += 1) { - f_0_0((float*) a,(float*) b,(float*) c,(float*) d,i,j); + f_0((float*) a,(float*) b,(float*) c,(float*) d,i,j); } } } -void f_0_0(float *restrict a_vec, float *restrict b_vec,""" +void f_0(float *restrict a_vec, float *restrict b_vec,""" """ float *restrict c_vec, float *restrict d_vec, const int i, const int j) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; @@ -187,15 +187,15 @@ def test_create_elemental_functions_complex(complex_function): float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec; for (int i = 0; i < 3; i += 1) { - f_0_0((float*) a,(float*) b,i); + f_0((float*) a,(float*) b,i); for (int j = 0; j < 5; j += 1) { - f_0_1((float*) a,(float*) b,(float*) c,(float*) d,i,j); + f_1((float*) a,(float*) b,(float*) c,(float*) d,i,j); } - f_0_2((float*) a,(float*) b,i); + f_2((float*) a,(float*) b,i); } } -void f_0_0(float *restrict a_vec, float *restrict b_vec, const int i) +void f_0(float *restrict a_vec, float *restrict b_vec, const int i) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec; @@ -204,7 +204,7 @@ def test_create_elemental_functions_complex(complex_function): b[i] = a[i] + pow(b[i], 2) + 3; } } -void f_0_1(float *restrict a_vec, float *restrict b_vec,""" +void f_1(float *restrict a_vec, float *restrict b_vec,""" """ float *restrict c_vec, float *restrict d_vec, const int i, const int j) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; @@ -217,7 +217,7 @@ def test_create_elemental_functions_complex(complex_function): a[i] = 4*(a[i] + c[i][j])*(b[i] + d[i][j][k]); } } -void f_0_2(float *restrict a_vec, float *restrict b_vec, const int i) +void f_2(float *restrict a_vec, float *restrict b_vec, const int i) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec; From 5bc8d19187cc309abd59b6b5813e639e73c9e103 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 22 Jun 2017 15:26:15 +0200 Subject: [PATCH 22/32] dse: Avoid creation of unnecessary functions --- devito/dse/symbolics.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/devito/dse/symbolics.py b/devito/dse/symbolics.py index 339f0ec192..33011cae71 100644 --- a/devito/dse/symbolics.py +++ b/devito/dse/symbolics.py @@ -328,13 +328,14 @@ def _eliminate_inter_stencil_redundancies(self, cluster, **kwargs): rules = OrderedDict() stencils = [] for c, (origin, alias) in enumerate(aliases.items()): - temporary = Indexed(template(c), *indices) + function = template(c) + temporary = Indexed(function, *indices) found.append(Eq(temporary, origin)) # Track the stencil of each TensorFunction introduced stencils.append(alias.anti_stencil.anti(cluster.stencil)) for aliased, distance in alias.with_distance: coordinates = [sum([i, j]) for i, j in distance.items() if i in indices] - rules[candidates[aliased]] = Indexed(template(c), *tuple(coordinates)) + rules[candidates[aliased]] = Indexed(function, *tuple(coordinates)) # Create the alias clusters alias_clusters = clusterize(found, stencils) From 1a1438c36f1d4bcde2593f0d62ab47cbe25cec5e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 22 Jun 2017 16:46:19 +0200 Subject: [PATCH 23/32] Attach pragmas to Iteration, not through wraps --- devito/dle/backends/advanced.py | 15 ++++++++------- devito/dle/blocking_utils.py | 4 ++-- devito/nodes.py | 14 ++++++++++---- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 3c7af0d6dc..af314d4b04 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -256,11 +256,11 @@ def decorate(nodes): aligned = [] if aligned: simd = omplang['simd-for-aligned'] - simd = simd(','.join([j.name for j in aligned]), - simdinfo[get_simd_flag()]) + simd = as_tuple(simd(','.join([j.name for j in aligned]), + simdinfo[get_simd_flag()])) else: - simd = omplang['simd-for'] - mapper[i] = List(ignore_deps + as_tuple(simd), i) + simd = as_tuple(omplang['simd-for']) + mapper[i] = i._rebuild(pragmas=i.pragmas + ignore_deps + simd) processed.append(Transformer(mapper).visit(node)) return processed @@ -294,13 +294,14 @@ def _ompize(self, state, **kwargs): nparallel = len(candidates) if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\ nparallel < 2: - parallelism = omplang['for'] + parallel = omplang['for'] else: - parallelism = omplang['collapse'](nparallel) + parallel = omplang['collapse'](nparallel) root = candidates[0] + parallel_root = root._rebuild(pragmas=root.pragmas + as_tuple(parallel)) mapper[root] = Block(header=omplang['par-region'], - body=denormals + [Element(parallelism), root]) + body=denormals + [parallel_root]) processed.append(Transformer(mapper).visit(node)) diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py index 43d459ae02..4ca9253d51 100644 --- a/devito/dle/blocking_utils.py +++ b/devito/dle/blocking_utils.py @@ -190,9 +190,9 @@ class IterationFold(Iteration): is_IterationFold = True def __init__(self, nodes, dimension, limits, index=None, offsets=None, - properties=None, folds=None): + properties=None, pragmas=None, folds=None): super(IterationFold, self).__init__(nodes, dimension, limits, index, - offsets, properties) + offsets, properties, pragmas) self.folds = folds def __repr__(self): diff --git a/devito/nodes.py b/devito/nodes.py index 1e152d944d..d3984cc360 100644 --- a/devito/nodes.py +++ b/devito/nodes.py @@ -256,6 +256,7 @@ class Iteration(Node): :param properties: A bag of strings indicating properties of this Iteration. For example, the string 'parallel' may be used to identify a parallelizable Iteration. + :param pragmas: A bag of pragmas attached to this Iteration. """ is_Iteration = True @@ -270,11 +271,12 @@ class Iteration(Node): executed in parallel. * vector-dim: A (SIMD) vectorizable iteration space. * elemental: Hoistable to an elemental function. + * remainder: A remainder iteration (e.g., by-product of some transformations) """ - _known_properties = ['sequential', 'parallel', 'vector-dim', 'elemental'] + _known_properties = ['sequential', 'parallel', 'vector-dim', 'elemental', 'remainder'] def __init__(self, nodes, dimension, limits, index=None, offsets=None, - properties=None): + properties=None, pragmas=None): # Ensure we deal with a list of Expression objects internally nodes = as_tuple(nodes) self.nodes = as_tuple([n if isinstance(n, Node) else Expression(n) @@ -304,9 +306,10 @@ def __init__(self, nodes, dimension, limits, index=None, offsets=None, self.offsets[0] = min(self.offsets[0], int(off)) self.offsets[1] = max(self.offsets[1], int(off)) - # Track this Iteration's properties + # Track this Iteration's properties and pragmas self.properties = as_tuple(properties) assert (i in Iteration._known_properties for i in self.properties) + self.pragmas = as_tuple(pragmas) def __repr__(self): properties = "" @@ -353,7 +356,10 @@ def ccode(self): loop_cond = '%s < %s' % (self.index, ccode(end)) loop_inc = '%s += %s' % (self.index, self.limits[2]) - return c.For(loop_init, loop_cond, loop_inc, c.Block(loop_body)) + handle = c.For(loop_init, loop_cond, loop_inc, c.Block(loop_body)) + if self.pragmas: + handle = c.Module(self.pragmas + (handle,)) + return handle @property def is_Open(self): From d5fe7a09ee32f139a7ffb1b5e0fcd8a2021aac6e Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 7 Jun 2017 15:44:34 +0200 Subject: [PATCH 24/32] dle: Move elemental functions to pipeline bottom --- devito/dle/backends/advanced.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index af314d4b04..4fd5df6cbd 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -31,11 +31,11 @@ class DevitoRewriter(BasicRewriter): def _pipeline(self, state): self._avoid_denormals(state) self._loop_fission(state) - #self._create_elemental_functions(state) self._loop_blocking(state) self._simdize(state) if self.params['openmp'] is True: self._ompize(state) + self._create_elemental_functions(state) @dle_pass def _loop_fission(self, state, **kwargs): @@ -314,12 +314,12 @@ def _pipeline(self, state): self._avoid_denormals(state) self._loop_fission(state) self._padding(state) - self._create_elemental_functions(state) self._loop_blocking(state) self._simdize(state) self._nontemporal_stores(state) if self.params['openmp'] is True: self._ompize(state) + self._create_elemental_functions(state) @dle_pass def _padding(self, state, **kwargs): From e85b5a29d5a3122e0b4ee08fb2a7509c626c9e4b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 22 Jun 2017 18:01:52 +0200 Subject: [PATCH 25/32] dle: Build up a single parallel region --- devito/dle/backends/advanced.py | 25 +++++++++++++++++++------ devito/dle/backends/utils.py | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 4fd5df6cbd..4b6f969f14 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -278,9 +278,10 @@ def _ompize(self, state, **kwargs): # Reset denormals flag each time a parallel region is entered denormals = FindNodes(Denormals).visit(state.nodes) - mapper = {i: List(c.Comment('DLE: moved denormals flag')) for i in denormals} + mapper = OrderedDict([(i, None) for i in denormals]) # Handle parallelizable loops + pvt = [] for tree in retrieve_iteration_tree(node): # Determine the number of consecutive parallelizable Iterations key = lambda i: i.is_Parallel and not i.is_Vectorizable @@ -299,11 +300,23 @@ def _ompize(self, state, **kwargs): parallel = omplang['collapse'](nparallel) root = candidates[0] - parallel_root = root._rebuild(pragmas=root.pragmas + as_tuple(parallel)) - mapper[root] = Block(header=omplang['par-region'], - body=denormals + [parallel_root]) - - processed.append(Transformer(mapper).visit(node)) + mapper[root] = root._rebuild(pragmas=root.pragmas + as_tuple(parallel)) + + # Track the thread-private and thread-shared variables + pvt += [i for i in FindSymbols('symbolics').visit(tree) if i._mem_stack] + + # Build the parallel region + pvt = sorted(set([i.name for i in pvt])) + pvt = ('private(%s)' % ','.join(pvt)) if pvt else '' + par_region = Block(header=omplang['par-region'](pvt), + body=denormals + [i for i in mapper.values() if i]) + for k, v in list(mapper.items()): + if v is not None: + mapper[k] = None if v.is_Remainder else par_region + + handle = Transformer(mapper).visit(node) + if handle is not None: + processed.append(handle) return {'nodes': processed} diff --git a/devito/dle/backends/utils.py b/devito/dle/backends/utils.py index c33d22f687..d5c504ec5a 100644 --- a/devito/dle/backends/utils.py +++ b/devito/dle/backends/utils.py @@ -9,7 +9,7 @@ omplang = { 'for': c.Pragma('omp for schedule(static)'), 'collapse': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i), - 'par-region': c.Pragma('omp parallel'), + 'par-region': lambda i: c.Pragma('omp parallel %s' % i), 'par-for': c.Pragma('omp parallel for schedule(static)'), 'simd-for': c.Pragma('omp simd'), 'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)) From 84fd358d9f58933f694f2c30122e5e44cfbffd31 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 23 Jun 2017 11:37:58 +0200 Subject: [PATCH 26/32] operator: Introduce OperatorDebug for C-level debug --- devito/cgen_utils.py | 2 ++ devito/operator.py | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/devito/cgen_utils.py b/devito/cgen_utils.py index c34447de14..a94dbd5f49 100644 --- a/devito/cgen_utils.py +++ b/devito/cgen_utils.py @@ -172,3 +172,5 @@ def ccode_eq(eq, **settings): blankline = c.Line("") +printmark = lambda i: c.Line('printf("Here: %s\\n"); fflush(stdout);' % i) +printvar = lambda i: c.Statement('printf("%s=%%s\\n", %s); fflush(stdout);' % (i, i)) diff --git a/devito/operator.py b/devito/operator.py index 53be3fc157..9c1c45ddc0 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -9,10 +9,11 @@ import sympy from devito.autotuning import autotune -from devito.cgen_utils import Allocator, blankline +from devito.cgen_utils import Allocator, blankline, printmark, printvar from devito.compiler import jit_compile, load from devito.dimension import Dimension, time -from devito.dle import compose_nodes, filter_iterations, transform +from devito.dle import (compose_nodes, filter_iterations, + retrieve_iteration_tree, transform) from devito.dse import clusterize, indexify, rewrite, q_indexed from devito.interfaces import SymbolicData, Forward, Backward from devito.logger import bar, error, info @@ -542,11 +543,40 @@ def _cglobals(self): return [self.profiler.ctype, blankline] +class OperatorDebug(OperatorCore): + """ + Decorate the generated code with useful print statements. + """ + + def __init__(self, expressions, **kwargs): + super(OperatorDebug, self).__init__(expressions, **kwargs) + self._includes.append('stdio.h') + + # Minimize the trip count of the sequential loops + iterations = set(flatten(retrieve_iteration_tree(self.body))) + mapper = {i: i._rebuild(limits=(max(i.offsets) + 2)) + for i in iterations if i.is_Sequential} + self.body = Transformer(mapper).visit(self.body) + + # Mark entry/exit points of each non-sequential Iteration tree in the body + iterations = [filter_iterations(i, lambda i: not i.is_Sequential, 'any') + for i in retrieve_iteration_tree(self.body)] + iterations = [i[0] for i in iterations if i] + mapper = {t: List(header=printmark('In nest %d' % i), body=t) + for i, t in enumerate(iterations)} + self.body = Transformer(mapper).visit(self.body) + + class Operator(object): def __new__(cls, *args, **kwargs): - # What type of Operator should I return ? - cls = OperatorForeign if kwargs.pop('external', False) else OperatorCore + # What type of Operator should I create ? + if kwargs.pop('external', False): + cls = OperatorForeign + elif kwargs.pop('debug', False): + cls = OperatorDebug + else: + cls = OperatorCore # Trigger instantiation obj = cls.__new__(cls, *args, **kwargs) From 9a7d3f695fa51e58f9b76d34ab0f4ec10688bb8f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 23 Jun 2017 18:06:55 +0200 Subject: [PATCH 27/32] dle: Generalize blocking pass for IterationFold [Finished] --- devito/dle/backends/advanced.py | 59 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 4b6f969f14..cceadca30c 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -2,7 +2,7 @@ from __future__ import absolute_import -from collections import OrderedDict, namedtuple +from collections import OrderedDict from itertools import combinations import numpy as np @@ -120,7 +120,6 @@ def _loop_blocking(self, state, **kwargs): have size 4x7. The latter may be set to True to also block innermost parallel :class:`Iteration` objects. """ - Region = namedtuple('Region', 'main leftover') exclude_innermost = 'blockinner' not in self.params blocked = OrderedDict() @@ -141,10 +140,11 @@ def _loop_blocking(self, state, **kwargs): if not IsPerfectIteration().visit(root): continue - # Construct the blocked loop nest, as well as all necessary - # remainder loops - regions = OrderedDict() - blocked_iterations = [] + # Build all necessary Iteration objects, individually. These will + # subsequently be composed to implement loop blocking. + inter_blocks = [] + intra_blocks = [] + remainders = [] for i in iterations: # Build Iteration over blocks dim = blocked.setdefault(i, Dimension("%s_block" % i.dim.name)) @@ -155,46 +155,43 @@ def _loop_blocking(self, state, **kwargs): finish = finish - ((finish - i.offsets[1]) % block_size) inter_block = Iteration([], dim, [start, finish, block_size], properties=as_tuple('parallel')) + inter_blocks.append(inter_block) # Build Iteration within a block start = inter_block.dim finish = start + block_size intra_block = i._rebuild([], limits=[start, finish, 1], offsets=None) + intra_blocks.append(intra_block) - blocked_iterations.append((inter_block, intra_block)) - - # Build unitary-increment Iteration over the 'main' region - # (the one blocked); necessary to generate code iterating over - # non-blocked ("remainder") iterations. - start = inter_block.limits[0] - finish = inter_block.limits[1] - main = i._rebuild([], limits=[start, finish, 1], offsets=None, - properties=('parallel', 'remainder')) - - # Build unitary-increment Iteration over the 'leftover' region: - # again as above, this may be necessary when the dimension size - # is not a multiple of the block size. + # Build unitary-increment Iteration over the 'leftover' region. + # This will be used for remainder loops, executed when any + # dimension size is not a multiple of the block size. start = inter_block.limits[1] finish = iter_size - i.offsets[1] - leftover = i._rebuild([], limits=[start, finish, 1], offsets=None, - properties=('parallel', 'remainder')) - - regions[i] = Region(main, leftover) + remainder = i._rebuild([], limits=[start, finish, 1], offsets=None) + remainders.append(remainder) - blocked_tree = list(flatten(zip(*blocked_iterations))) - blocked_tree = compose_nodes(blocked_tree + [iterations[-1].nodes]) + # Build blocked Iteration nest + blocked_tree = [compose_nodes(inter_blocks + intra_blocks + + [iterations[-1].nodes])] - # Build remainder loops + # Build remainder Iterations remainder_tree = [] for n in range(len(iterations)): - for i in combinations(iterations, n + 1): - nodes = [v.leftover if k in i else v.main - for k, v in regions.items()] - nodes += [iterations[-1].nodes] + for c in combinations([i.dim for i in iterations], n + 1): + # First all inter-block Interations + nodes = [b for b, r in zip(inter_blocks, remainders) + if r.dim not in c] + # Then intra-block or remainder, for each dim (in order) + for b, r in zip(intra_blocks, remainders): + nodes.extend([r] if b.dim in c else [b]) + nodes = [i._rebuild(properties=i.properties + ('remainder',)) + for i in nodes] + nodes.extend([iterations[-1].nodes]) remainder_tree.append(compose_nodes(nodes)) # Will replace with blocked loop tree - mapper[root] = List(body=[blocked_tree] + remainder_tree) + mapper[root] = List(body=blocked_tree + remainder_tree) rebuilt = Transformer(mapper).visit(fold) From 09074c89cbc916a7770717d77e5f84ecc078ffd2 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 23 Jun 2017 18:09:08 +0200 Subject: [PATCH 28/32] flake8 fixes --- devito/dle/backends/advanced.py | 6 ++---- devito/dle/blocking_utils.py | 2 +- devito/operator.py | 4 +--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index cceadca30c..d359e92b7f 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -8,8 +8,6 @@ import numpy as np import psutil -import cgen as c - from devito.dimension import Dimension from devito.dle import (compose_nodes, copy_arrays, filter_iterations, fold_blockable_tree, unfold_blocked_tree, @@ -20,8 +18,8 @@ from devito.exceptions import DLEException from devito.interfaces import TensorFunction from devito.logger import dle_warning -from devito.nodes import Block, Denormals, Element, Expression, Iteration, List -from devito.tools import as_tuple, flatten, grouper, roundm +from devito.nodes import Block, Denormals, Expression, Iteration, List +from devito.tools import as_tuple, grouper, roundm from devito.visitors import (FindNodes, FindSymbols, IsPerfectIteration, SubstituteExpression, Transformer) diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py index 4ca9253d51..c7e1783a63 100644 --- a/devito/dle/blocking_utils.py +++ b/devito/dle/blocking_utils.py @@ -7,7 +7,7 @@ from devito.nodes import Expression, Iteration, List, LocalExpression from devito.visitors import (FindAdjacentIterations, FindNodes, IsPerfectIteration, NestedTransformer, Transformer) -from devito.tools import as_tuple, flatten +from devito.tools import as_tuple __all__ = ['fold_blockable_tree', 'unfold_blocked_tree'] diff --git a/devito/operator.py b/devito/operator.py index 9c1c45ddc0..b407c46b6c 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -9,7 +9,7 @@ import sympy from devito.autotuning import autotune -from devito.cgen_utils import Allocator, blankline, printmark, printvar +from devito.cgen_utils import Allocator, blankline, printmark from devito.compiler import jit_compile, load from devito.dimension import Dimension, time from devito.dle import (compose_nodes, filter_iterations, @@ -501,8 +501,6 @@ def apply(self, *args, **kwargs): info("Section %s with OI=%.2f computed in %.3f s [Perf: %.2f GFlops/s]" % (name, v.oi, v.time, v.gflopss)) - print "u_sum = ", arguments['u'].sum() - print "v_sum = ", arguments['v'].sum() return summary def _arg_data(self, argument): From 20e07aa769f15fdda2440c8912b13fd0a0d8a9fa Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 26 Jun 2017 15:31:17 +0200 Subject: [PATCH 29/32] dle: Enhance loop dependence analysis --- devito/dle/backends/common.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/devito/dle/backends/common.py b/devito/dle/backends/common.py index 26b2e1036a..0364b465f5 100644 --- a/devito/dle/backends/common.py +++ b/devito/dle/backends/common.py @@ -172,24 +172,30 @@ def _analyze(self, state): if not has_parallel_dimension: continue - # Is the Iteration tree fully-parallel or OSIP? + # Determine if fully-parallel (FP), OSIP, unit-stride (in innermost dim) + is_FP = True is_OSIP = False + is_US = True for e1 in exprs: lhs = e1.lhs if lhs.is_Symbol: continue for e2 in exprs: handle = [i for i in terms[e2] if as_symbol(i) == as_symbol(lhs)] - if any(lhs.indices[0] != i.indices[0] for i in handle): - is_OSIP = True - break + is_FP &= len(handle) == 0 + is_OSIP |= any(lhs.indices[0] != i.indices[0] for i in handle) + is_US &= all(lhs.indices[-1] == i.indices[-1] for i in handle) + + # Is the innermost Iteration vectorizable? + is_Vectorizable = is_FP or is_OSIP or is_US # Track the discovered properties if is_OSIP: mapper.setdefault(tree[0], []).append('sequential') - for i in tree[is_OSIP:-1]: + for i in tree[is_OSIP:]: mapper.setdefault(i, []).append('parallel') - mapper.setdefault(tree[-1], []).extend(['parallel', 'vector-dim']) + if is_Vectorizable: + mapper.setdefault(tree[-1], []).append('vector-dim') # Introduce the discovered properties in the Iteration/Expression tree for k, v in list(mapper.items()): From 399ee7c53bd36220c3a77e04f51f31b0c4cf3741 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 26 Jun 2017 15:42:17 +0200 Subject: [PATCH 30/32] tests: Update expected output --- tests/test_operator.py | 54 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/test_operator.py b/tests/test_operator.py index 0e2dc499fd..bb609491f8 100644 --- a/tests/test_operator.py +++ b/tests/test_operator.py @@ -264,15 +264,15 @@ def test_heap_1D_stencil(self, a, b): assert """\ float (*a); posix_memalign((void**)&a, 64, sizeof(float[3])); - struct timeval start_loop_i_0, end_loop_i_0; - gettimeofday(&start_loop_i_0, NULL); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int i = 0; i < 3; i += 1) { a[i] = a[i] + b[i] + 5.0F; } - gettimeofday(&end_loop_i_0, NULL); - timings->loop_i_0 += (double)(end_loop_i_0.tv_sec-start_loop_i_0.tv_sec)\ -+(double)(end_loop_i_0.tv_usec-start_loop_i_0.tv_usec)/1000000; + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; free(a); return 0;""" in str(operator.ccode) @@ -283,8 +283,8 @@ def test_heap_perfect_2D_stencil(self, a, c): float (*c)[5]; posix_memalign((void**)&a, 64, sizeof(float[3])); posix_memalign((void**)&c, 64, sizeof(float[3][5])); - struct timeval start_loop_i_0, end_loop_i_0; - gettimeofday(&start_loop_i_0, NULL); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) @@ -293,9 +293,9 @@ def test_heap_perfect_2D_stencil(self, a, c): c[i][j] = a[i]*c[i][j]; } } - gettimeofday(&end_loop_i_0, NULL); - timings->loop_i_0 += (double)(end_loop_i_0.tv_sec-start_loop_i_0.tv_sec)\ -+(double)(end_loop_i_0.tv_usec-start_loop_i_0.tv_usec)/1000000; + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; free(a); free(c); return 0;""" in str(operator.ccode) @@ -307,19 +307,19 @@ def test_heap_imperfect_2D_stencil(self, a, c): float (*c)[5]; posix_memalign((void**)&a, 64, sizeof(float[3])); posix_memalign((void**)&c, 64, sizeof(float[3][5])); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int i = 0; i < 3; i += 1) { a[i] = 0.0F; - struct timeval start_loop_j_0, end_loop_j_0; - gettimeofday(&start_loop_j_0, NULL); for (int j = 0; j < 5; j += 1) { c[i][j] = a[i]*c[i][j]; } - gettimeofday(&end_loop_j_0, NULL); - timings->loop_j_0 += (double)(end_loop_j_0.tv_sec-start_loop_j_0.tv_sec)\ -+(double)(end_loop_j_0.tv_usec-start_loop_j_0.tv_usec)/1000000; } + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; free(a); free(c); return 0;""" in str(operator.ccode) @@ -330,32 +330,32 @@ def test_stack_scalar_temporaries(self, a, t0, t1): assert """\ float (*a); posix_memalign((void**)&a, 64, sizeof(float[3])); - struct timeval start_loop_i_0, end_loop_i_0; - gettimeofday(&start_loop_i_0, NULL); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int i = 0; i < 3; i += 1) { float t0 = 1.00000000000000F; float t1 = 2.00000000000000F; a[i] = 3.0F*t0*t1; } - gettimeofday(&end_loop_i_0, NULL); - timings->loop_i_0 += (double)(end_loop_i_0.tv_sec-start_loop_i_0.tv_sec)\ -+(double)(end_loop_i_0.tv_usec-start_loop_i_0.tv_usec)/1000000; + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; free(a); return 0;""" in str(operator.ccode) def test_stack_vector_temporaries(self, c_stack, e): operator = Operator([Eq(c_stack, e*1.)], dse='noop', dle='noop') assert """\ - struct timeval start_loop_k_0, end_loop_k_0; - gettimeofday(&start_loop_k_0, NULL); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int k = 0; k < 7; k += 1) { for (int s = 0; s < 4; s += 1) { for (int q = 0; q < 4; q += 1) { - double c_stack[3][5] __attribute__((aligned(64))); + float c_stack[3][5] __attribute__((aligned(64))); for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) @@ -366,9 +366,9 @@ def test_stack_vector_temporaries(self, c_stack, e): } } } - gettimeofday(&end_loop_k_0, NULL); - timings->loop_k_0 += (double)(end_loop_k_0.tv_sec-start_loop_k_0.tv_sec)\ -+(double)(end_loop_k_0.tv_usec-start_loop_k_0.tv_usec)/1000000; + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; return 0;""" in str(operator.ccode) @@ -403,7 +403,7 @@ def test_expressions_imperfect_loops(self, ti0, ti1, ti2, t0): assert outer[-1].nodes[0].expr.rhs == eq1.rhs assert inner[-1].nodes[0].expr.rhs == eq2.rhs - def test_different_loop_nests(self, tu, ti0, t0, t1): + def test_different_section_nests(self, tu, ti0, t0, t1): eq1 = Eq(ti0, t0*3.) eq2 = Eq(tu, ti0 + t1*3.) op = Operator([eq1, eq2], dse='noop', dle='noop') From 34994aa00bf6591318b0a7aaedd62539d53e4eac Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 27 Jun 2017 11:57:14 +0200 Subject: [PATCH 31/32] dse: Reschedule cluster after some passes --- devito/dse/clusterizer.py | 14 +++++++++++++ devito/dse/graph.py | 42 +++++++++++++++++++++++++++++++++++++++ devito/dse/symbolics.py | 6 +++--- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/devito/dse/clusterizer.py b/devito/dse/clusterizer.py index 01cf4a0e9c..c05828bb22 100644 --- a/devito/dse/clusterizer.py +++ b/devito/dse/clusterizer.py @@ -34,6 +34,20 @@ def is_sparse(self): return not self.is_dense def rebuild(self, exprs): + """ + Build a new cluster with expressions ``exprs`` having same stencil + as ``self``. + """ + return Cluster(exprs, self.stencil) + + def reschedule(self, exprs): + """ + Build a new cluster with expressions ``exprs`` having same stencil + as ``self``. The order of the expressions in the new cluster is such that + self's ordering is honored. + """ + g = temporaries_graph(exprs) + exprs = g.reschedule(self.exprs) return Cluster(exprs, self.stencil) diff --git a/devito/dse/graph.py b/devito/dse/graph.py index a6d45ef1a8..c5801fedd6 100644 --- a/devito/dse/graph.py +++ b/devito/dse/graph.py @@ -174,6 +174,48 @@ def trace(self, key): [(k, v)] + list(queue.items())) return found.values() + def reschedule(self, context): + """ + Starting from the temporaries in ``self``, return a new sequence of + expressions that: :: + + * includes all expressions in ``context`` not appearing in ``self``, and + * is ordered so that the ordering in ``context`` is honored. + + Examples + ======== + Assume that: :: + + * ``self`` has five temporaries ``[t0, t1, t2, e1, e2]``, + * ``t1`` depends on the temporary ``e1``, and ``t2`` depends on ``t1`` + * ``context = [e1, e2]`` + + Then the following sequence is returned ``[t0, e1, t1, t2, e2]``. + + If, instead, we had had everything like before except: :: + + * ``context = [t1, e1, e2]`` + + Then the following sequence is returned ``[t0, t1, t2, e1, e2]``. + That is, in the latter example the original ordering dictated by ``context`` + was honored. + """ + processed = [i.lhs for i in context] + queue = [i for i in self if i not in processed] + while queue: + k = queue.pop(0) + handle = self[k].readby + if all(i in processed for i in handle): + index = min(processed.index(i) for i in handle) + processed.insert(index, k) + else: + # Note: push at the back + queue.append(k) + + processed = [self[i] for i in processed] + + return processed + def time_invariant(self, expr=None): """ Check if ``expr`` is time invariant. ``expr`` may be an expression ``e`` diff --git a/devito/dse/symbolics.py b/devito/dse/symbolics.py index 33011cae71..471751bf74 100644 --- a/devito/dse/symbolics.py +++ b/devito/dse/symbolics.py @@ -176,7 +176,7 @@ def _extract_time_varying(self, cluster, **kwargs): processed, _ = xreplace_constrained(cluster.exprs, make, rule, cm) - return cluster.rebuild(processed) + return cluster.reschedule(processed) @dse_pass def _extract_time_invariants(self, cluster, **kwargs): @@ -202,7 +202,7 @@ def _extract_time_invariants(self, cluster, **kwargs): found = common_subexprs_elimination(found, make) - return cluster.rebuild(found + leaves) + return cluster.reschedule(found + leaves) @dse_pass def _eliminate_intra_stencil_redundancies(self, cluster, **kwargs): @@ -219,7 +219,7 @@ def _eliminate_intra_stencil_redundancies(self, cluster, **kwargs): processed = common_subexprs_elimination(candidates, make) - return cluster.rebuild(skip + processed) + return cluster.reschedule(skip + processed) @dse_pass def _factorize(self, cluster, **kwargs): From a5db600f9b196721013a19a7b42be0709c70e962 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 29 Jun 2017 16:09:37 +0200 Subject: [PATCH 32/32] docstring fixes --- devito/dle/backends/advanced.py | 6 +++--- devito/dle/blocking_utils.py | 2 +- devito/dle/inspection.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index d359e92b7f..f5017dfe3c 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -103,8 +103,8 @@ def _loop_blocking(self, state, **kwargs): Blocking is applied to parallel iteration trees. Heuristically, innermost dimensions are not blocked to maximize the trip count of the SIMD loops. - Different heuristics may be specified via ``kwargs['blockshape']`` and - ``kwargs['blockinner']``. The former, a dictionary, is used to indicate + Different heuristics may be specified by passing the keywords ``blockshape`` + and ``blockinner`` to the DLE. The former, a dictionary, is used to indicate a specific block size for each blocked dimension. For example, for the :class:`Iteration` tree: :: @@ -113,7 +113,7 @@ def _loop_blocking(self, state, **kwargs): for k ... - one may provide ``kwargs['blockshape'] = {i: 4, j: 7}``, in which case the + one may provide ``blockshape = {i: 4, j: 7}``, in which case the two outer loops will blocked, and the resulting 2-dimensional block will have size 4x7. The latter may be set to True to also block innermost parallel :class:`Iteration` objects. diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py index c7e1783a63..f343f7f684 100644 --- a/devito/dle/blocking_utils.py +++ b/devito/dle/blocking_utils.py @@ -205,7 +205,7 @@ def __repr__(self): @property def ccode(self): - comment = c.Comment('This IterationFold is "hiding" ore or more Iterations') + comment = c.Comment('This IterationFold is "hiding" one or more Iterations') code = super(IterationFold, self).ccode return c.Module([comment, code]) diff --git a/devito/dle/inspection.py b/devito/dle/inspection.py index f45b6b097a..55a976cf72 100644 --- a/devito/dle/inspection.py +++ b/devito/dle/inspection.py @@ -23,7 +23,7 @@ def retrieve_iteration_tree(node, mode='normal'): :param node: The searched Iteration/Expression tree. :param mode: Accepted values are 'normal' (default) and 'superset', in which - case iteration trees that are subset of larget iteration trees + case iteration trees that are subset of larger iteration trees are dropped. """ assert mode in ('normal', 'superset')