diff --git a/devito/cgen_utils.py b/devito/cgen_utils.py index c80c35a7a4..a94dbd5f49 100644 --- a/devito/cgen_utils.py +++ b/devito/cgen_utils.py @@ -20,14 +20,10 @@ def push_stack(self, scope, obj): """ Generate a cgen statement that allocates ``obj`` on the stack. """ - dtype = c.dtype_to_ctype(obj.dtype) - shape = "".join("[%d]" % j for j in obj.shape) + shape = "".join("[%s]" % ccode(i) for i in obj.symbolic_shape) alignment = "__attribute__((aligned(64)))" - - item = c.POD(dtype, "%s%s %s" % (obj.name, shape, alignment)) - handle = self.stack.setdefault(scope, []) - if item not in handle: - handle.append(item) + handle = self.stack.setdefault(scope, OrderedDict()) + handle[obj] = c.POD(obj.dtype, "%s%s %s" % (obj.name, shape, alignment)) def push_heap(self, obj): """ @@ -37,10 +33,11 @@ def push_heap(self, obj): if obj in self.heap: return - decl = "(*%s)%s" % (obj.name, "".join("[%d]" % j for j in obj.shape[1:])) + decl = "(*%s)%s" % (obj.name, + "".join("[%s]" % i.symbolic_size for i in obj.indices[1:])) decl = c.Value(c.dtype_to_ctype(obj.dtype), decl) - shape = "".join("[%d]" % j for j in obj.shape) + shape = "".join("[%s]" % i.symbolic_size for i in obj.indices) alloc = "posix_memalign((void**)&%s, 64, sizeof(%s%s))" alloc = alloc % (obj.name, c.dtype_to_ctype(obj.dtype), shape) alloc = c.Statement(alloc) @@ -51,7 +48,7 @@ def push_heap(self, obj): @property def onstack(self): - return self.stack.items() + return [(k, v.values()) for k, v in self.stack.items()] @property def onheap(self): @@ -175,3 +172,5 @@ def ccode_eq(eq, **settings): blankline = c.Line("") +printmark = lambda i: c.Line('printf("Here: %s\\n"); fflush(stdout);' % i) +printvar = lambda i: c.Statement('printf("%s=%%s\\n", %s); fflush(stdout);' % (i, i)) diff --git a/devito/dimension.py b/devito/dimension.py index 62bc16ae81..69a0a57452 100644 --- a/devito/dimension.py +++ b/devito/dimension.py @@ -1,7 +1,7 @@ import cgen import numpy as np -from sympy import Symbol +from sympy import Number, Symbol __all__ = ['Dimension', 'x', 'y', 'z', 't', 'p', 'd', 'time'] @@ -32,7 +32,10 @@ def __str__(self): @property def symbolic_size(self): """The symbolic size of this dimension.""" - return Symbol(self.ccode) + try: + return Number(self.ccode) + except ValueError: + return Symbol(self.ccode) @property def ccode(self): diff --git a/devito/dle/__init__.py b/devito/dle/__init__.py index e1e708e079..f4ee089bf0 100644 --- a/devito/dle/__init__.py +++ b/devito/dle/__init__.py @@ -1,4 +1,5 @@ from devito.dle.inspection import * # noqa from devito.dle.manipulation import * # noqa +from devito.dle.blocking_utils import * # noqa from devito.dle.transformer import * # noqa from devito.dle.backends import YaskGrid, init # noqa diff --git a/devito/dle/backends/advanced.py b/devito/dle/backends/advanced.py index 93e06e8599..f5017dfe3c 100644 --- a/devito/dle/backends/advanced.py +++ b/devito/dle/backends/advanced.py @@ -2,16 +2,15 @@ from __future__ import absolute_import -from collections import OrderedDict, namedtuple +from collections import OrderedDict from itertools import combinations import numpy as np import psutil -import cgen as c - from devito.dimension import Dimension from devito.dle import (compose_nodes, copy_arrays, filter_iterations, + fold_blockable_tree, unfold_blocked_tree, retrieve_iteration_tree) from devito.dle.backends import (BasicRewriter, BlockingArg, dle_pass, omplang, simdinfo, get_simd_flag, get_simd_items) @@ -19,8 +18,8 @@ from devito.exceptions import DLEException from devito.interfaces import TensorFunction from devito.logger import dle_warning -from devito.nodes import Block, Denormals, Element, Expression, Iteration, List -from devito.tools import as_tuple, flatten, grouper, roundm +from devito.nodes import Block, Denormals, Expression, Iteration, List +from devito.tools import as_tuple, grouper, roundm from devito.visitors import (FindNodes, FindSymbols, IsPerfectIteration, SubstituteExpression, Transformer) @@ -30,11 +29,11 @@ class DevitoRewriter(BasicRewriter): def _pipeline(self, state): self._avoid_denormals(state) self._loop_fission(state) - self._create_elemental_functions(state) self._loop_blocking(state) self._simdize(state) if self.params['openmp'] is True: self._ompize(state) + self._create_elemental_functions(state) @dle_pass def _loop_fission(self, state, **kwargs): @@ -101,36 +100,37 @@ def _loop_blocking(self, state, **kwargs): """ Apply loop blocking to :class:`Iteration` trees. - By default, the blocked :class:`Iteration` objects and the block size are - determined heuristically. The heuristic consists of searching the deepest - Iteration/Expression tree and blocking all dimensions except: - - * The innermost (eg, to retain SIMD vectorization); - * Those dimensions inducing loop-carried dependencies. + Blocking is applied to parallel iteration trees. Heuristically, innermost + dimensions are not blocked to maximize the trip count of the SIMD loops. - The caller may take over the heuristic through ``kwargs['blocking']``, - a dictionary indicating the block size of each blocked dimension. For - example, for the :class:`Iteration` tree below: :: + Different heuristics may be specified by passing the keywords ``blockshape`` + and ``blockinner`` to the DLE. The former, a dictionary, is used to indicate + a specific block size for each blocked dimension. For example, for the + :class:`Iteration` tree: :: for i for j for k ... - one may pass in ``kwargs['blocking'] = {i: 4, j: 7}``, in which case the - two outer loops would be blocked, and the resulting 2-dimensional block - would be of size 4x7. + one may provide ``blockshape = {i: 4, j: 7}``, in which case the + two outer loops will blocked, and the resulting 2-dimensional block will + have size 4x7. The latter may be set to True to also block innermost parallel + :class:`Iteration` objects. """ - Region = namedtuple('Region', 'main leftover') + exclude_innermost = 'blockinner' not in self.params blocked = OrderedDict() processed = [] for node in state.nodes: + # Make sure loop blocking will span as many Iterations as possible + fold = fold_blockable_tree(node, exclude_innermost) + mapper = {} - for tree in retrieve_iteration_tree(node): + for tree in retrieve_iteration_tree(fold): # Is the Iteration tree blockable ? iterations = [i for i in tree if i.is_Parallel] - if 'blockinner' not in self.params: + if exclude_innermost: iterations = [i for i in iterations if not i.is_Vectorizable] if not iterations: continue @@ -138,10 +138,11 @@ def _loop_blocking(self, state, **kwargs): if not IsPerfectIteration().visit(root): continue - # Construct the blocked loop nest, as well as all necessary - # remainder loops - regions = OrderedDict() - blocked_iterations = [] + # Build all necessary Iteration objects, individually. These will + # subsequently be composed to implement loop blocking. + inter_blocks = [] + intra_blocks = [] + remainders = [] for i in iterations: # Build Iteration over blocks dim = blocked.setdefault(i, Dimension("%s_block" % i.dim.name)) @@ -152,52 +153,48 @@ def _loop_blocking(self, state, **kwargs): finish = finish - ((finish - i.offsets[1]) % block_size) inter_block = Iteration([], dim, [start, finish, block_size], properties=as_tuple('parallel')) + inter_blocks.append(inter_block) # Build Iteration within a block start = inter_block.dim finish = start + block_size - properties = 'vector-dim' if i.is_Vectorizable else None - intra_block = Iteration([], i.dim, [start, finish, 1], i.index, - properties=as_tuple(properties)) - - blocked_iterations.append((inter_block, intra_block)) - - # Build unitary-increment Iteration over the 'main' region - # (the one blocked); necessary to generate code iterating over - # non-blocked ("remainder") iterations. - start = inter_block.limits[0] - finish = inter_block.limits[1] - main = Iteration([], i.dim, [start, finish, 1], i.index, - properties=i.properties) - - # Build unitary-increment Iteration over the 'leftover' region: - # again as above, this may be necessary when the dimension size - # is not a multiple of the block size. + intra_block = i._rebuild([], limits=[start, finish, 1], offsets=None) + intra_blocks.append(intra_block) + + # Build unitary-increment Iteration over the 'leftover' region. + # This will be used for remainder loops, executed when any + # dimension size is not a multiple of the block size. start = inter_block.limits[1] finish = iter_size - i.offsets[1] - leftover = Iteration([], i.dim, [start, finish, 1], i.index, - properties=i.properties) + remainder = i._rebuild([], limits=[start, finish, 1], offsets=None) + remainders.append(remainder) - regions[i] = Region(main, leftover) + # Build blocked Iteration nest + blocked_tree = [compose_nodes(inter_blocks + intra_blocks + + [iterations[-1].nodes])] - blocked_tree = list(flatten(zip(*blocked_iterations))) - blocked_tree = compose_nodes(blocked_tree + [iterations[-1].nodes]) - - # Build remainder loops + # Build remainder Iterations remainder_tree = [] for n in range(len(iterations)): - for i in combinations(iterations, n + 1): - nodes = [v.leftover if k in i else v.main - for k, v in regions.items()] - nodes += [iterations[-1].nodes] + for c in combinations([i.dim for i in iterations], n + 1): + # First all inter-block Interations + nodes = [b for b, r in zip(inter_blocks, remainders) + if r.dim not in c] + # Then intra-block or remainder, for each dim (in order) + for b, r in zip(intra_blocks, remainders): + nodes.extend([r] if b.dim in c else [b]) + nodes = [i._rebuild(properties=i.properties + ('remainder',)) + for i in nodes] + nodes.extend([iterations[-1].nodes]) remainder_tree.append(compose_nodes(nodes)) # Will replace with blocked loop tree - mapper[root] = List(body=[blocked_tree] + remainder_tree) + mapper[root] = List(body=blocked_tree + remainder_tree) - rebuilt = Transformer(mapper).visit(node) + rebuilt = Transformer(mapper).visit(fold) - processed.append(rebuilt) + # Finish unrolling any previously folded Iterations + processed.append(unfold_blocked_tree(rebuilt)) # All blocked dimensions if not blocked: @@ -254,11 +251,11 @@ def decorate(nodes): aligned = [] if aligned: simd = omplang['simd-for-aligned'] - simd = simd(','.join([j.name for j in aligned]), - simdinfo[get_simd_flag()]) + simd = as_tuple(simd(','.join([j.name for j in aligned]), + simdinfo[get_simd_flag()])) else: - simd = omplang['simd-for'] - mapper[i] = List(ignore_deps + as_tuple(simd), i) + simd = as_tuple(omplang['simd-for']) + mapper[i] = i._rebuild(pragmas=i.pragmas + ignore_deps + simd) processed.append(Transformer(mapper).visit(node)) return processed @@ -276,13 +273,14 @@ def _ompize(self, state, **kwargs): # Reset denormals flag each time a parallel region is entered denormals = FindNodes(Denormals).visit(state.nodes) - mapper = {i: List(c.Comment('DLE: moved denormals flag')) for i in denormals} + mapper = OrderedDict([(i, None) for i in denormals]) # Handle parallelizable loops + pvt = [] for tree in retrieve_iteration_tree(node): # Determine the number of consecutive parallelizable Iterations key = lambda i: i.is_Parallel and not i.is_Vectorizable - candidates = filter_iterations(tree, key=key, stop='consecutive') + candidates = filter_iterations(tree, key=key, stop='any') if not candidates: continue @@ -292,15 +290,28 @@ def _ompize(self, state, **kwargs): nparallel = len(candidates) if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\ nparallel < 2: - parallelism = omplang['for'] + parallel = omplang['for'] else: - parallelism = omplang['collapse'](nparallel) + parallel = omplang['collapse'](nparallel) root = candidates[0] - mapper[root] = Block(header=omplang['par-region'], - body=denormals + [Element(parallelism), root]) + mapper[root] = root._rebuild(pragmas=root.pragmas + as_tuple(parallel)) - processed.append(Transformer(mapper).visit(node)) + # Track the thread-private and thread-shared variables + pvt += [i for i in FindSymbols('symbolics').visit(tree) if i._mem_stack] + + # Build the parallel region + pvt = sorted(set([i.name for i in pvt])) + pvt = ('private(%s)' % ','.join(pvt)) if pvt else '' + par_region = Block(header=omplang['par-region'](pvt), + body=denormals + [i for i in mapper.values() if i]) + for k, v in list(mapper.items()): + if v is not None: + mapper[k] = None if v.is_Remainder else par_region + + handle = Transformer(mapper).visit(node) + if handle is not None: + processed.append(handle) return {'nodes': processed} @@ -311,12 +322,12 @@ def _pipeline(self, state): self._avoid_denormals(state) self._loop_fission(state) self._padding(state) - self._create_elemental_functions(state) self._loop_blocking(state) self._simdize(state) self._nontemporal_stores(state) if self.params['openmp'] is True: self._ompize(state) + self._create_elemental_functions(state) @dle_pass def _padding(self, state, **kwargs): diff --git a/devito/dle/backends/basic.py b/devito/dle/backends/basic.py index cae9717419..99dfa2be3d 100644 --- a/devito/dle/backends/basic.py +++ b/devito/dle/backends/basic.py @@ -22,10 +22,10 @@ def _pipeline(self, state): @dle_pass def _avoid_denormals(self, state, **kwargs): """ - Introduce nodes in the Iteration/Expression tree that will generate macros - to avoid computing with denormal numbers. These are normally flushed away - when using SSE-like instruction sets in a complete C program, but when - compiling shared objects specific instructions must instead be inserted. + Introduce nodes in the Iteration/Expression tree that will expand to C + macros telling the CPU to flush denormal numbers in hardware. Denormals + are normally flushed when using SSE-based instruction sets, except when + compiling shared objects. """ return {'nodes': (Denormals(),) + state.nodes, 'includes': ('xmmintrin.h', 'pmmintrin.h')} @@ -33,81 +33,75 @@ def _avoid_denormals(self, state, **kwargs): @dle_pass def _create_elemental_functions(self, state, **kwargs): """ - Move :class:`Iteration` sub-trees to separate functions. + Extract :class:`Iteration` sub-trees and move them into :class:`Function`s. - By default, inner iteration trees are moved. To move different types of - :class:`Iteration`, one can provide a lambda function in ``kwargs['rule']``, - taking as input an iterable of :class:`Iteration` and returning an iterable - of :class:`Iteration` (eg, a subset, the whole iteration tree). + By default, only innermost Iteration objects containing more than + ``self.thresholds['elemental']`` operations are extracted. One can specify a + different extraction rule through the lambda function ``kwargs['rule']``, + which takes as input an iterable of :class:`Iteration`s and returns an + :class:`Iteration` node. """ noinline = self._compiler_decoration('noinline', c.Comment('noinline?')) - rule = kwargs.get('rule', lambda tree: tree[-1:]) + rule = kwargs.get('rule', lambda tree: tree[-1]) functions = [] processed = [] - for i, node in enumerate(state.nodes): + for node in state.nodes: mapper = {} - for j, tree in enumerate(retrieve_iteration_tree(node)): + for tree in retrieve_iteration_tree(node, mode='superset'): if len(tree) <= 1: continue + root = rule(tree) - name = "f_%d_%d" % (i, j) + # Has an identical body been encountered already? + view = root.view + if view in mapper: + mapper[view][1].append(root) + continue - candidate = rule(tree) - root = candidate[0] - expressions = FindNodes(Expression).visit(candidate) + name = "f_%d" % len(functions) # Heuristic: create elemental functions only if more than # self.thresholds['elemental_functions'] operations are present + expressions = FindNodes(Expression).visit(root) ops = estimate_cost([e.expr for e in expressions]) if ops < self.thresholds['elemental'] and not root.is_Elementizable: continue - # Determine the elemental function's arguments ... - already_in_scope = [k.dim for k in candidate] - required = [k for k in FindSymbols(mode='free-symbols').visit(candidate) - if k not in already_in_scope] - required += [as_symbol(k) for k in - set(flatten(k.free_symbols for k in root.bounds_symbolic))] - required = set(required) + # Determine the arguments required by the elemental function + in_scope = [i.dim for i in tree[tree.index(root):]] + required = FindSymbols(mode='free-symbols').visit(root) + for i in FindSymbols('symbolics').visit(root): + required.extend(flatten(j.free_symbols for j in i.symbolic_shape)) + required = set([as_symbol(i) for i in required if i not in in_scope]) + # Add tensor arguments args = [] seen = {e.output for e in expressions if e.is_scalar} - for d in FindSymbols('symbolics').visit(candidate): - # Add a necessary Symbolic object - handle = "(float*) %s" if d.is_SymbolicFunction else "%s_vec" - args.append((handle % d.name, d)) - seen |= {as_symbol(d)} - # Add necessary information related to Dimensions - for k in d.indices: - if k.size is not None: - continue - # Dimension size - size = k.symbolic_size - if size not in seen: - args.append((k.ccode, k)) - seen |= {size} - # Dimension index may be required too - if k in required - seen: - index_arg = (k.name, ScalarFunction(name=k.name, - dtype=np.int32)) - args.append(index_arg) - seen |= {k} - - # Add non-temporary scalars to the elemental function's arguments + for i in FindSymbols('symbolics').visit(root): + if i.is_SymbolicFunction: + handle = "(%s*) %s" % (c.dtype_to_ctype(i.dtype), i.name) + else: + handle = "%s_vec" % i.name + args.append((handle, i)) + seen |= {as_symbol(i)} + # Add scalar arguments handle = filter_sorted(required - seen, key=attrgetter('name')) - args.extend([(k.name, ScalarFunction(name=k.name, dtype=np.int32)) - for k in handle]) + args.extend([(i.name, ScalarFunction(name=i.name, dtype=np.int32)) + for i in handle]) # Track info to transform the main tree call, parameters = zip(*args) - mapper[root] = List(header=noinline, body=FunCall(name, call)) + mapper[view] = (List(header=noinline, body=FunCall(name, call)), [root]) # Produce the new function functions.append(Function(name, root, 'void', parameters, ('static',))) # Transform the main tree - processed.append(Transformer(mapper).visit(node)) + imapper = {} + for v, keys in mapper.values(): + imapper.update({k: v for k in keys}) + processed.append(Transformer(imapper).visit(node)) return {'nodes': processed, 'elemental_functions': functions} diff --git a/devito/dle/backends/common.py b/devito/dle/backends/common.py index 1cdff58971..0364b465f5 100644 --- a/devito/dle/backends/common.py +++ b/devito/dle/backends/common.py @@ -47,11 +47,6 @@ def update(self, nodes=None, elemental_functions=None, arguments=None, self.includes += as_tuple(includes) self.flags += as_tuple(flags) - @property - def has_applied_nontemporal_stores(self): - """True if nontemporal stores will be generated, False otherwise.""" - return 'ntstores' in self.flags - @property def has_applied_blocking(self): """True if loop blocking was applied, False otherwise.""" @@ -67,8 +62,6 @@ class Arg(object): """A DLE-produced argument.""" - from_Blocking = False - def __init__(self, argument, value): self.argument = argument self.value = value @@ -79,8 +72,6 @@ def __repr__(self): class BlockingArg(Arg): - from_Blocking = True - def __init__(self, blocked_dim, iteration, value): """ Represent an argument introduced in the kernel by Rewriter._loop_blocking. @@ -107,7 +98,7 @@ class AbstractRewriter(object): Transform Iteration/Expression trees to generate high performance C. This is just an abstract class. Actual transformers should implement the - abstract method ``_pipeline``, that is a sequence of AST transformations. + abstract method ``_pipeline``, which performs a sequence of AST transformations. """ __metaclass__ = abc.ABCMeta @@ -146,19 +137,17 @@ def _analyze(self, state): Analyze the Iteration/Expression trees in ``state.nodes`` to detect information useful to the subsequent DLE passes. - In particular, the presence of fully-parallel or "outermost-sequential - inner-parallel" (OSIP) :class:`Iteration` trees is tracked. In an OSIP - :class:`Iteration` tree, outermost :class:`Iteration` objects represent - an inherently sequential dimension, whereas all inner :class:`Iteration` - objects represent parallelizable dimensions. + In particular, fully-parallel or "outermost-sequential inner-parallel" + (OSIP) :class:`Iteration` trees are searched tracked. In an OSIP + :class:`Iteration` tree, the outermost :class:`Iteration` represents + a sequential dimension, whereas all inner :class:`Iteration` objects + represent parallel dimensions. """ - nodes = state.nodes sections = FindSections().visit(nodes) - trees = sections.keys() - candidate = max(trees, key=lambda i: len(i)) - candidates = [i for i in trees if len(i) == len(candidate)] + candidate = max(list(sections), key=lambda i: len(i)) + candidates = [i for i in sections if len(i) == len(candidate)] # The analysis below may return "false positives" (ie, absence of fully- # parallel or OSIP trees when this is actually false), but this should @@ -183,29 +172,35 @@ def _analyze(self, state): if not has_parallel_dimension: continue - # Is the Iteration tree fully-parallel or OSIP? + # Determine if fully-parallel (FP), OSIP, unit-stride (in innermost dim) + is_FP = True is_OSIP = False + is_US = True for e1 in exprs: lhs = e1.lhs if lhs.is_Symbol: continue for e2 in exprs: handle = [i for i in terms[e2] if as_symbol(i) == as_symbol(lhs)] - if any(lhs.indices[0] != i.indices[0] for i in handle): - is_OSIP = True - break + is_FP &= len(handle) == 0 + is_OSIP |= any(lhs.indices[0] != i.indices[0] for i in handle) + is_US &= all(lhs.indices[-1] == i.indices[-1] for i in handle) + + # Is the innermost Iteration vectorizable? + is_Vectorizable = is_FP or is_OSIP or is_US # Track the discovered properties if is_OSIP: mapper.setdefault(tree[0], []).append('sequential') - for i in tree[is_OSIP:-1]: + for i in tree[is_OSIP:]: mapper.setdefault(i, []).append('parallel') - mapper.setdefault(tree[-1], []).extend(['parallel', 'vector-dim']) + if is_Vectorizable: + mapper.setdefault(tree[-1], []).append('vector-dim') # Introduce the discovered properties in the Iteration/Expression tree for k, v in list(mapper.items()): args = k.args - # 'sequential' has obviously precedence over 'parallel' + # 'sequential' kills 'parallel' properties = ('sequential',) if 'sequential' in v else tuple(v) properties = as_tuple(args.pop('properties')) + properties mapper[k] = Iteration(properties=properties, **args) diff --git a/devito/dle/backends/utils.py b/devito/dle/backends/utils.py index c33d22f687..d5c504ec5a 100644 --- a/devito/dle/backends/utils.py +++ b/devito/dle/backends/utils.py @@ -9,7 +9,7 @@ omplang = { 'for': c.Pragma('omp for schedule(static)'), 'collapse': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i), - 'par-region': c.Pragma('omp parallel'), + 'par-region': lambda i: c.Pragma('omp parallel %s' % i), 'par-for': c.Pragma('omp parallel for schedule(static)'), 'simd-for': c.Pragma('omp simd'), 'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)) diff --git a/devito/dle/blocking_utils.py b/devito/dle/blocking_utils.py new file mode 100644 index 0000000000..f343f7f684 --- /dev/null +++ b/devito/dle/blocking_utils.py @@ -0,0 +1,229 @@ +import cgen as c +from sympy import Eq, Symbol +import numpy as np + +from devito.dle import compose_nodes, is_foldable, retrieve_iteration_tree +from devito.dse import xreplace_indices +from devito.nodes import Expression, Iteration, List, LocalExpression +from devito.visitors import (FindAdjacentIterations, FindNodes, IsPerfectIteration, + NestedTransformer, Transformer) +from devito.tools import as_tuple + +__all__ = ['fold_blockable_tree', 'unfold_blocked_tree'] + + +def fold_blockable_tree(node, exclude_innermost=False): + """ + Create :class:`IterationFold`s from sequences of nested :class:`Iteration`. + """ + found = FindAdjacentIterations().visit(node) + found.pop('seen_iteration') + + mapper = {} + for k, v in found.items(): + for i in v: + # Pre-condition: they all must be perfect iterations + assert len(i) > 1 + if any(not IsPerfectIteration().visit(j) for j in i): + continue + # Only retain consecutive trees having same depth + trees = [retrieve_iteration_tree(j)[0] for j in i] + handle = [] + for j in trees: + if len(j) != len(trees[0]): + break + handle.append(j) + trees = handle + if not trees: + continue + # Check foldability + pairwise_folds = zip(*reversed(trees)) + if any(not is_foldable(j) for j in pairwise_folds): + continue + # Perform folding + for j in pairwise_folds[:-exclude_innermost]: + root, remainder = j[0], j[1:] + folds = [(tuple(y-x for x, y in zip(i.offsets, root.offsets)), i.nodes) + for i in remainder] + mapper[root] = IterationFold(folds=folds, **root.args) + for k in remainder: + mapper[k] = None + + # Insert the IterationFolds in the Iteration/Expression tree + processed = NestedTransformer(mapper).visit(node) + + return processed + + +def unfold_blocked_tree(node): + """ + Unfold nested :class:`IterationFold`. + + Examples + ======== + Given a section of Iteration/Expression tree as below: :: + + for i = 1 to N-1 // folded + for j = 1 to N-1 // folded + foo1() + + Assuming a fold with offset 1 in both /i/ and /j/ and body ``foo2()``, create: + + for i = 1 to N-1 + for j = 1 to N-1 + foo1() + for i = 2 to N-2 + for j = 2 to N-2 + foo2() + """ + # Search the unfolding candidates + candidates = [] + for tree in retrieve_iteration_tree(node): + handle = tuple(i for i in tree if i.is_IterationFold) + if handle: + # Sanity check + assert IsPerfectIteration().visit(handle[0]) + candidates.append(handle) + + # Perform unfolding + mapper = {} + for tree in candidates: + trees = zip(*[i.unfold() for i in tree]) + trees = optimize_unfolded_tree(trees[:-1], trees[-1]) + trees = [compose_nodes(i) for i in trees] + mapper[tree[0]] = List(body=trees) + + # Insert the unfolded Iterations in the Iteration/Expression tree + processed = Transformer(mapper).visit(node) + + return processed + + +def optimize_unfolded_tree(unfolded, root): + """ + Transform folded trees to reduce the memory footprint. + + Examples + ======== + Given: + + .. code-block:: + for i = 1 to N - 1 # Folded tree + for j = 1 to N - 1 + tmp[i,j] = ... + for i = 2 to N - 2 # Root + for j = 2 to N - 2 + ... = ... tmp[i,j] ... + + The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the + iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the + ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting + iteration tree becomes: + + .. code-block:: + for i = 1 to i_bs + 1 # Folded tree + for j = 1 to j_bs + 1 + i' = i + i_block - 2 + j' = j + j_block - 2 + tmp[i,j] = ... # use i' and j' + for i = i_block to i_block + i_bs # Root + for j = j_block to j_block + j_bs + i' = i - x_block + j' = j - j_block + ... = ... tmp[i',j'] ... + """ + processed = [] + for i, tree in enumerate(unfolded): + otree = [] + stmts = [] + mapper = {} + + # "Shrink" the iteration space + for j in tree: + start, end, incr = j.args['limits'] + otree.append(j._rebuild(limits=[0, end-start, incr])) + index = Symbol('%ss%d' % (j.index, i)) + stmts.append((LocalExpression(Eq(index, j.dim + start), np.int32), + LocalExpression(Eq(index, j.dim - start), np.int32))) + mapper[j.dim] = index + + # Substitute iteration variables within the folded trees + exprs = FindNodes(Expression).visit(otree[-1]) + replaced = xreplace_indices([j.expr for j in exprs], mapper, only_rhs=True) + subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)] + + handle = Transformer(dict(zip(exprs, subs))).visit(otree[-1]) + handle = handle._rebuild(nodes=(zip(*stmts)[0] + handle.nodes)) + processed.append(tuple(otree[:-1]) + (handle,)) + + # Temporary arrays can now be moved to the stack + if all(not j.is_Remainder for j in otree): + shape = tuple(j.bounds_symbolic[1] for j in otree) + for j in subs: + shape += j.output_function.shape[len(otree):] + j.output_function.update(shape=shape, onstack=True) + + # Introduce the new iteration variables within root + candidates = [j.output for j in subs] + exprs = FindNodes(Expression).visit(root[-1]) + replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates) + subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)] + handle = Transformer(dict(zip(exprs, subs))).visit(root[-1]) + handle = handle._rebuild(nodes=(zip(*stmts)[1] + handle.nodes)) + root = root[:-1] + (handle,) + + return processed + [root] + + +class IterationFold(Iteration): + + """ + An IterationFold is a special :class:`Iteration` object that represents + a sequence of consecutive (in program order) Iterations. In an IterationFold, + all Iterations of the sequence but the so called ``root`` are "hidden"; that is, + they cannot be visited by an Iteration/Expression tree visitor. + + The Iterations in the sequence represented by the IterationFold all have same + dimension and properties. However, their extent is relative to that of the ``root``. + """ + + is_IterationFold = True + + def __init__(self, nodes, dimension, limits, index=None, offsets=None, + properties=None, pragmas=None, folds=None): + super(IterationFold, self).__init__(nodes, dimension, limits, index, + offsets, properties, pragmas) + self.folds = folds + + def __repr__(self): + properties = "" + if self.properties: + properties = "WithProperties[%s]::" % ",".join(self.properties) + length = "Length %d" % len(self.folds) + return "<%sIterationFold %s; %s; %s>" % (properties, self.index, + self.limits, length) + + @property + def ccode(self): + comment = c.Comment('This IterationFold is "hiding" one or more Iterations') + code = super(IterationFold, self).ccode + return c.Module([comment, code]) + + def unfold(self): + """ + Return the corresponding :class:`Iteration` objects from each fold in ``self``. + """ + args = self.args + args.pop('folds') + + # Construct the root Iteration + root = Iteration(**args) + + # Construct the folds + args.pop('nodes') + args.pop('offsets') + start, end, incr = args.pop('limits') + folds = tuple(Iteration(nodes, limits=[start+ofs[0], end+ofs[1], incr], **args) + for ofs, nodes in self.folds) + + return folds + as_tuple(root) diff --git a/devito/dle/inspection.py b/devito/dle/inspection.py index 95cd31eb66..55a976cf72 100644 --- a/devito/dle/inspection.py +++ b/devito/dle/inspection.py @@ -1,9 +1,10 @@ from devito.visitors import FindSections +from devito.tools import as_tuple -__all__ = ['filter_iterations', 'retrieve_iteration_tree'] +__all__ = ['filter_iterations', 'retrieve_iteration_tree', 'is_foldable'] -def retrieve_iteration_tree(node): +def retrieve_iteration_tree(node, mode='normal'): """Return a list of all :class:`Iteration` sub-trees rooted in ``node``. For example, given the Iteration tree: @@ -19,31 +20,75 @@ def retrieve_iteration_tree(node): Return the list: :: [(Iteration i, Iteration j, Iteration k), (Iteration i, Iteration p)] + + :param node: The searched Iteration/Expression tree. + :param mode: Accepted values are 'normal' (default) and 'superset', in which + case iteration trees that are subset of larger iteration trees + are dropped. """ + assert mode in ('normal', 'superset') - return [i for i in FindSections().visit(node).keys() if i] + trees = [i for i in FindSections().visit(node) if i] + if mode == 'normal': + return trees + else: + match = [] + for i in trees: + if any(set(i).issubset(set(j)) for j in trees if i != j): + continue + match.append(i) + return match -def filter_iterations(tree, key=lambda i: i, stop=lambda i: False): +def filter_iterations(tree, key=lambda i: i, stop=lambda: False): """ Given an iterable of :class:`Iteration` objects, return a new list containing all items such that ``key(o)`` is True. This function accepts an optional argument ``stop``. This may be either a - lambda function, specifying a stop criterium, or the special keyword - 'consecutive', which makes the function return as soon as ``key(o)`` - gives False and at least one item has been collected. + lambda function, specifying a stop criterium, or any of the following + special keywords: :: + + * 'any': Return as soon as ``key(o)`` is False and at least one + item has been collected. + * 'asap': Return as soon as at least one item has been collected and + all items for which ``key(o)`` is False have been encountered. + + It is useful to specify a ``stop`` criterium when one is searching the + first Iteration in an Iteration/Expression tree for which a given property + does not hold. """ + assert callable(stop) or stop in ['any', 'asap'] + tree = list(tree) filtered = [] + off = [] - if stop == 'consecutive': + if stop == 'any': stop = lambda: len(filtered) > 0 + elif stop == 'asap': + hits = [i for i in tree if not key(i)] + stop = lambda: len(filtered) > 0 and len(off) == len(hits) for i in tree: if key(i): filtered.append(i) - elif stop(): + else: + off.append(i) + if stop(): break return filtered + + +def is_foldable(nodes): + """ + Return True if the iterable ``nodes`` consists of foldable :class:`Iteration` + objects, False otherwise. + """ + nodes = as_tuple(nodes) + if len(nodes) <= 1 or any(not i.is_Iteration for i in nodes): + return False + main = nodes[0] + return all(i.dim == main.dim and i.limits == main.limits and i.index == main.index + and i.properties == main.properties for i in nodes) diff --git a/devito/dle/manipulation.py b/devito/dle/manipulation.py index 117e42e90c..6c804a1610 100644 --- a/devito/dle/manipulation.py +++ b/devito/dle/manipulation.py @@ -7,7 +7,9 @@ def compose_nodes(nodes, retrieve=False): - """Build an Iteration/Expression tree by nesting the nodes in ``nodes``.""" + """ + Build an Iteration/Expression tree by nesting the nodes in ``nodes``. + """ l = list(nodes) tree = [] @@ -25,10 +27,12 @@ def compose_nodes(nodes, retrieve=False): def copy_arrays(mapper, reverse=False): - """Build an Iteration/Expression tree performing the copy ``k = v``, or + """ + Build an Iteration/Expression tree performing the copy ``k = v``, or ``v = k`` if reverse=True, for each (k, v) in mapper. (k, v) are expected to be of type :class:`IndexedData`. The loop bounds are inferred from - the dimensions used in ``k``.""" + the dimensions used in ``k``. + """ if not mapper: return () diff --git a/devito/dse/clusterizer.py b/devito/dse/clusterizer.py index 01cf4a0e9c..c05828bb22 100644 --- a/devito/dse/clusterizer.py +++ b/devito/dse/clusterizer.py @@ -34,6 +34,20 @@ def is_sparse(self): return not self.is_dense def rebuild(self, exprs): + """ + Build a new cluster with expressions ``exprs`` having same stencil + as ``self``. + """ + return Cluster(exprs, self.stencil) + + def reschedule(self, exprs): + """ + Build a new cluster with expressions ``exprs`` having same stencil + as ``self``. The order of the expressions in the new cluster is such that + self's ordering is honored. + """ + g = temporaries_graph(exprs) + exprs = g.reschedule(self.exprs) return Cluster(exprs, self.stencil) diff --git a/devito/dse/graph.py b/devito/dse/graph.py index a6d45ef1a8..c5801fedd6 100644 --- a/devito/dse/graph.py +++ b/devito/dse/graph.py @@ -174,6 +174,48 @@ def trace(self, key): [(k, v)] + list(queue.items())) return found.values() + def reschedule(self, context): + """ + Starting from the temporaries in ``self``, return a new sequence of + expressions that: :: + + * includes all expressions in ``context`` not appearing in ``self``, and + * is ordered so that the ordering in ``context`` is honored. + + Examples + ======== + Assume that: :: + + * ``self`` has five temporaries ``[t0, t1, t2, e1, e2]``, + * ``t1`` depends on the temporary ``e1``, and ``t2`` depends on ``t1`` + * ``context = [e1, e2]`` + + Then the following sequence is returned ``[t0, e1, t1, t2, e2]``. + + If, instead, we had had everything like before except: :: + + * ``context = [t1, e1, e2]`` + + Then the following sequence is returned ``[t0, t1, t2, e1, e2]``. + That is, in the latter example the original ordering dictated by ``context`` + was honored. + """ + processed = [i.lhs for i in context] + queue = [i for i in self if i not in processed] + while queue: + k = queue.pop(0) + handle = self[k].readby + if all(i in processed for i in handle): + index = min(processed.index(i) for i in handle) + processed.insert(index, k) + else: + # Note: push at the back + queue.append(k) + + processed = [self[i] for i in processed] + + return processed + def time_invariant(self, expr=None): """ Check if ``expr`` is time invariant. ``expr`` may be an expression ``e`` diff --git a/devito/dse/manipulation.py b/devito/dse/manipulation.py index 35dfe34b2d..7a59dbd2dd 100644 --- a/devito/dse/manipulation.py +++ b/devito/dse/manipulation.py @@ -7,14 +7,14 @@ from sympy import Indexed, collect, collect_const, flatten from devito.dse.extended_sympy import Add, Eq, Mul -from devito.dse.inspection import count, estimate_cost +from devito.dse.inspection import count, estimate_cost, retrieve_indexed from devito.dse.graph import temporaries_graph from devito.dse.queries import q_op from devito.interfaces import TensorFunction from devito.tools import as_tuple __all__ = ['collect_nested', 'common_subexprs_elimination', 'freeze_expression', - 'xreplace_constrained', 'promote_scalar_expressions'] + 'xreplace_constrained', 'xreplace_indices', 'promote_scalar_expressions'] def freeze_expression(expr): @@ -105,7 +105,7 @@ def run(expr): def xreplace_constrained(exprs, make, rule, cm=lambda e: True, repeat=False): """ - As opposed to ``xreplace``, which replaces all objects specified in a mapper, + Unlike ``xreplace``, which replaces all objects specified in a mapper, this function replaces all objects satisfying two criteria: :: * The "matching rule" -- a function returning True if a node within ``expr`` @@ -187,6 +187,20 @@ def run(expr): return found + rebuilt, found +def xreplace_indices(exprs, mapper, candidates=None, only_rhs=False): + """ + Create new expressions from ``exprs``, by replacing all index variables + specified in mapper appearing as a tensor index. Only tensors whose symbolic + name appears in ``candidates`` are considered if ``candidates`` is not None. + """ + get = lambda i: i.rhs if only_rhs is True else i + handle = flatten(retrieve_indexed(get(i)) for i in as_tuple(exprs)) + if candidates is not None: + handle = [i for i in handle if i.base.label in candidates] + mapper = dict(zip(handle, [i.xreplace(mapper) for i in handle])) + return [i.xreplace(mapper) for i in exprs] + + def common_subexprs_elimination(exprs, make, mode='default'): """ Perform common subexpressions elimination. diff --git a/devito/dse/symbolics.py b/devito/dse/symbolics.py index 511e61ce4a..471751bf74 100644 --- a/devito/dse/symbolics.py +++ b/devito/dse/symbolics.py @@ -176,7 +176,7 @@ def _extract_time_varying(self, cluster, **kwargs): processed, _ = xreplace_constrained(cluster.exprs, make, rule, cm) - return cluster.rebuild(processed) + return cluster.reschedule(processed) @dse_pass def _extract_time_invariants(self, cluster, **kwargs): @@ -202,7 +202,7 @@ def _extract_time_invariants(self, cluster, **kwargs): found = common_subexprs_elimination(found, make) - return cluster.rebuild(found + leaves) + return cluster.reschedule(found + leaves) @dse_pass def _eliminate_intra_stencil_redundancies(self, cluster, **kwargs): @@ -219,7 +219,7 @@ def _eliminate_intra_stencil_redundancies(self, cluster, **kwargs): processed = common_subexprs_elimination(candidates, make) - return cluster.rebuild(skip + processed) + return cluster.reschedule(skip + processed) @dse_pass def _factorize(self, cluster, **kwargs): @@ -302,7 +302,7 @@ def _eliminate_inter_stencil_redundancies(self, cluster, **kwargs): # Redundancies will be stored in space-varying temporaries g = cluster.trace indices = g.space_indices - shape = g.space_shape + shape = tuple(i.symbolic_size for i in indices) # Template for captured redundancies name = self.conventions['redundancy'] + "%d" @@ -328,13 +328,14 @@ def _eliminate_inter_stencil_redundancies(self, cluster, **kwargs): rules = OrderedDict() stencils = [] for c, (origin, alias) in enumerate(aliases.items()): - temporary = Indexed(template(c), *indices) + function = template(c) + temporary = Indexed(function, *indices) found.append(Eq(temporary, origin)) # Track the stencil of each TensorFunction introduced stencils.append(alias.anti_stencil.anti(cluster.stencil)) for aliased, distance in alias.with_distance: coordinates = [sum([i, j]) for i, j in distance.items() if i in indices] - rules[candidates[aliased]] = Indexed(template(c), *tuple(coordinates)) + rules[candidates[aliased]] = Indexed(function, *tuple(coordinates)) # Create the alias clusters alias_clusters = clusterize(found, stencils) diff --git a/devito/exceptions.py b/devito/exceptions.py index abbe5e9ad0..3750ce99bc 100644 --- a/devito/exceptions.py +++ b/devito/exceptions.py @@ -14,6 +14,10 @@ class StencilOperationError(Exception): pass +class VisitorException(Exception): + pass + + class DSEException(Exception): pass diff --git a/devito/interfaces.py b/devito/interfaces.py index 892c69be99..bd2305c2a8 100644 --- a/devito/interfaces.py +++ b/devito/interfaces.py @@ -118,6 +118,12 @@ def __new__(cls, *args, **kwargs): options = kwargs.get('options', {}) newobj = Function.__new__(newcls, *args, **options) newobj.__init__(*args, **kwargs) + + # All objects cached on the AbstractSymbol /newobj/ keep a reference + # to /newobj/ through the /function/ field. Thus, all indexified + # object will point to /newobj/, the "actual Function". + newobj.function = newobj + # Store new instance in symbol cache newcls._cache_put(newobj) return newobj @@ -135,7 +141,26 @@ def dim(self): @property def indexed(self): """Extract a :class:`IndexedData` object from the current object.""" - return IndexedData(self.name, shape=self.shape, function=self) + return IndexedData(self.name, shape=self.shape, function=self.function) + + @property + def symbolic_shape(self): + """ + Return the symbolic shape of the object. For an entry ``E`` in ``self.shape``, + there are two possibilities: :: + + * ``E`` is already in symbolic form, then simply use ``E``. + * ``E`` is an integer representing the size along a dimension ``D``, + then, use a symbolic representation of ``D``. + """ + sshape = [] + for i, j in zip(self.shape, self.indices): + try: + i.is_algebraic_expr() + sshape.append(i) + except AttributeError: + sshape.append(j.symbolic_size) + return tuple(sshape) @property def _mem_external(self): @@ -165,7 +190,13 @@ def indexify(self): class SymbolicFunction(AbstractSymbol): - """A symbolic function object.""" + + """ + A symbolic function object, created and managed directly by Devito. + + Unlike :class:`SymbolicData` objects, the state of a SymbolicFunction + is mutable. + """ is_SymbolicFunction = True @@ -173,6 +204,9 @@ def __new__(cls, *args, **kwargs): kwargs.update({'options': {'evaluate': False}}) return AbstractSymbol.__new__(cls, *args, **kwargs) + def update(self): + return + class ScalarFunction(SymbolicFunction): """Symbolic object representing a scalar. @@ -196,6 +230,9 @@ def _mem_stack(self): in a C module, False otherwise.""" return True + def update(self, dtype=None): + self.dtype = dtype or self.dtype + class TensorFunction(SymbolicFunction): """Symbolic object representing a tensor. @@ -215,7 +252,7 @@ def __init__(self, *args, **kwargs): self.shape = kwargs.get('shape') self.indices = kwargs.get('dimensions') self.dtype = kwargs.get('dtype', np.float32) - self.onstack = kwargs.get('onstack', False) + self._onstack = kwargs.get('onstack', False) @classmethod def _indices(cls, **kwargs): @@ -223,11 +260,17 @@ def _indices(cls, **kwargs): @property def _mem_stack(self): - return self.onstack + return self._onstack @property def _mem_heap(self): - return not self.onstack + return not self._onstack + + def update(self, dtype=None, shape=None, dimensions=None, onstack=None): + self.dtype = dtype or self.dtype + self.shape = shape or self.shape + self.indices = dimensions or self.indices + self._onstack = onstack or self._mem_stack class SymbolicData(AbstractSymbol): diff --git a/devito/nodes.py b/devito/nodes.py index 21038d732b..d3984cc360 100644 --- a/devito/nodes.py +++ b/devito/nodes.py @@ -15,8 +15,8 @@ from devito.stencil import Stencil from devito.tools import as_tuple, filter_ordered -__all__ = ['Node', 'Block', 'Expression', 'Function', 'Iteration', 'List', - 'TimedList'] +__all__ = ['Node', 'Block', 'Denormals', 'Expression', 'Function', 'FunCall', + 'Iteration', 'List', 'LocalExpression', 'TimedList'] class Node(object): @@ -24,6 +24,7 @@ class Node(object): is_Node = True is_Block = False is_Iteration = False + is_IterationFold = False is_Expression = False is_Function = False is_FunCall = False @@ -58,6 +59,14 @@ def ccode(self): """Generate C code.""" raise NotImplementedError() + @property + def view(self): + """ + Generate a representation of the Iteration/Expression tree rooted in ``self``. + """ + from devito.visitors import printAST + return printAST(self) + @property def children(self): """Return the traversable children.""" @@ -236,11 +245,9 @@ def stencil(self): class Iteration(Node): - """Iteration object that encapsualtes a single loop over nodes, possibly - just SymPy expressions. + """Iteration object that encapsualtes a single loop over nodes. - :param nodes: Single or list of :class:`Node` objects that - define the loop body. + :param nodes: Single or list of :class:`Node` objects defining the loop body. :param dimension: :class:`Dimension` object over which to iterate. :param limits: Limits for the iteration space, either the loop size or a tuple of the form (start, finish, stepping). @@ -249,6 +256,7 @@ class Iteration(Node): :param properties: A bag of strings indicating properties of this Iteration. For example, the string 'parallel' may be used to identify a parallelizable Iteration. + :param pragmas: A bag of pragmas attached to this Iteration. """ is_Iteration = True @@ -263,11 +271,12 @@ class Iteration(Node): executed in parallel. * vector-dim: A (SIMD) vectorizable iteration space. * elemental: Hoistable to an elemental function. + * remainder: A remainder iteration (e.g., by-product of some transformations) """ - _known_properties = ['sequential', 'parallel', 'vector-dim', 'elemental'] + _known_properties = ['sequential', 'parallel', 'vector-dim', 'elemental', 'remainder'] def __init__(self, nodes, dimension, limits, index=None, offsets=None, - properties=None): + properties=None, pragmas=None): # Ensure we deal with a list of Expression objects internally nodes = as_tuple(nodes) self.nodes = as_tuple([n if isinstance(n, Node) else Expression(n) @@ -297,9 +306,10 @@ def __init__(self, nodes, dimension, limits, index=None, offsets=None, self.offsets[0] = min(self.offsets[0], int(off)) self.offsets[1] = max(self.offsets[1], int(off)) - # Track this Iteration's properties + # Track this Iteration's properties and pragmas self.properties = as_tuple(properties) assert (i in Iteration._known_properties for i in self.properties) + self.pragmas = as_tuple(pragmas) def __repr__(self): properties = "" @@ -346,7 +356,10 @@ def ccode(self): loop_cond = '%s < %s' % (self.index, ccode(end)) loop_inc = '%s += %s' % (self.index, self.limits[2]) - return c.For(loop_init, loop_cond, loop_inc, c.Block(loop_body)) + handle = c.For(loop_init, loop_cond, loop_inc, c.Block(loop_body)) + if self.pragmas: + handle = c.Module(self.pragmas + (handle,)) + return handle @property def is_Open(self): @@ -372,6 +385,10 @@ def is_Vectorizable(self): def is_Elementizable(self): return 'elemental' in self.properties + @property + def is_Remainder(self): + return 'remainder' in self.properties + @property def bounds_symbolic(self): """Return a 2-tuple representing the symbolic bounds of the object.""" @@ -481,7 +498,8 @@ def _ccasts(self): alignment = "__attribute__((aligned(64)))" handle = [f for f in self.parameters if isinstance(f, (SymbolicData, TensorFunction))] - shapes = [(f, ''.join(["[%s]" % i.ccode for i in f.indices[1:]])) for f in handle] + shapes = [(f, ''.join(["[%s]" % ccode(i) for i in f.symbolic_shape[1:]])) + for f in handle] casts = [c.Initializer(c.POD(v.dtype, '(*restrict %s)%s %s' % (v.name, shape, alignment)), '(%s (*)%s) %s' % (c.dtype_to_ctype(v.dtype), diff --git a/devito/operator.py b/devito/operator.py index 8d342c3ee3..b407c46b6c 100644 --- a/devito/operator.py +++ b/devito/operator.py @@ -1,32 +1,29 @@ from __future__ import absolute_import import operator -from collections import OrderedDict, namedtuple +from collections import OrderedDict from ctypes import c_int -from functools import reduce import cgen as c import numpy as np import sympy from devito.autotuning import autotune -from devito.cgen_utils import Allocator, blankline +from devito.cgen_utils import Allocator, blankline, printmark from devito.compiler import jit_compile, load from devito.dimension import Dimension, time -from devito.dle import compose_nodes, filter_iterations, transform -from devito.dse import (clusterize, estimate_cost, estimate_memory, indexify, - rewrite, q_indexed) +from devito.dle import (compose_nodes, filter_iterations, + retrieve_iteration_tree, transform) +from devito.dse import clusterize, indexify, rewrite, q_indexed from devito.interfaces import SymbolicData, Forward, Backward from devito.logger import bar, error, info -from devito.nodes import (Element, Expression, Function, Iteration, List, - LocalExpression, TimedList) +from devito.nodes import Element, Expression, Function, Iteration, List, LocalExpression from devito.parameters import configuration -from devito.profiling import Profiler +from devito.profiling import Profiler, create_profile from devito.stencil import Stencil from devito.tools import as_tuple, filter_ordered, flatten -from devito.visitors import (FindNodes, FindSections, FindSymbols, FindScopes, - IsPerfectIteration, ResolveIterationVariable, - SubstituteExpression, Transformer) +from devito.visitors import (FindSymbols, FindScopes, ResolveIterationVariable, + SubstituteExpression, Transformer, NestedTransformer) from devito.exceptions import InvalidArgument, InvalidOperator __all__ = ['Operator'] @@ -98,8 +95,7 @@ def __init__(self, expressions, **kwargs): nodes = self._schedule_expressions(clusters, ordering) # Introduce C-level profiling infrastructure - self.sections = OrderedDict() - nodes = self._profile_sections(nodes) + nodes, self.profiler = self._profile_sections(nodes) # Parameters of the Operator (Dimensions necessary for data casts) parameters = FindSymbols('kernel-data').visit(nodes) @@ -315,13 +311,7 @@ def _extra_arguments(self): def _profile_sections(self, nodes): """Introduce C-level profiling nodes within the Iteration/Expression tree.""" - return List(body=nodes) - - def _profile_summary(self, dim_sizes): - """ - Produce a summary of the performance achieved - """ - return PerformanceSummary() + return List(body=nodes), Profiler() def _autotune(self, arguments): """Use auto-tuning on this Operator to determine empirically the @@ -373,7 +363,7 @@ def _schedule_expressions(self, clusters, ordering): # No Iterations are needed processed.extend(expressions) - return processed + return List(body=processed) def _insert_declarations(self, dle_state, parameters): """Populate the Operator's body with the required array and @@ -403,7 +393,7 @@ def _insert_declarations(self, dle_state, parameters): elif k.output_function._mem_stack: # On the stack, as established by the DLE key = lambda i: i.dim not in k.output_function.indices - site = filter_iterations(v, key=key, stop='consecutive') + site = filter_iterations(reversed(v), key=key, stop='asap') or [nodes] allocator.push_stack(site[-1], k.output_function) else: # On the heap, as a tensor that must be globally accessible @@ -411,11 +401,14 @@ def _insert_declarations(self, dle_state, parameters): # Introduce declarations on the stack for k, v in allocator.onstack: - allocs = as_tuple([Element(i) for i in v]) - mapper[k] = Iteration(allocs + k.nodes, **k.args_frozen) - nodes = Transformer(mapper).visit(nodes) + mapper[k] = tuple(Element(i) for i in v) + nodes = NestedTransformer(mapper).visit(nodes) elemental_functions = Transformer(mapper).visit(dle_state.elemental_functions) + # TODO + # Use x_block_size+2, y_block_size+2 instead of x_size, y_size as shape of + # the tensor functions + # Introduce declarations on the heap (if any) if allocator.onheap: decls, allocs, frees = zip(*allocator.onheap) @@ -489,10 +482,6 @@ class OperatorCore(OperatorBasic): C code evaluating stencil expressions, can also execute the computation. """ - def __init__(self, expressions, **kwargs): - self.profiler = Profiler() - super(OperatorCore, self).__init__(expressions, **kwargs) - def __call__(self, *args, **kwargs): self.apply(*args, **kwargs) @@ -505,7 +494,7 @@ def apply(self, *args, **kwargs): self.cfunction(*list(arguments.values())) # Output summary of performance achieved - summary = self._profile_summary(dim_sizes) + summary = self.profiler.summary(dim_sizes, self.dtype) with bar(): for k, v in summary.items(): name = '%s<%s>' % (k, ','.join('%d' % i for i in v.itershape)) @@ -526,61 +515,7 @@ def _arg_shape(self, argument): def _profile_sections(self, nodes): """Introduce C-level profiling nodes within the Iteration/Expression tree.""" - mapper = {} - for node in nodes: - for itspace in FindSections().visit(node).keys(): - for i in itspace: - if IsPerfectIteration().visit(i): - # Insert `TimedList` block. This should come from - # the profiler, but we do this manually for now. - lname = 'loop_%s_%d' % (i.index, len(mapper)) - mapper[i] = TimedList(gname=self.profiler.varname, - lname=lname, body=i) - self.profiler.add(lname) - - # Estimate computational properties of the timed section - # (operational intensity, memory accesses) - expressions = FindNodes(Expression).visit(i) - ops = estimate_cost([e.expr for e in expressions]) - memory = estimate_memory([e.expr for e in expressions]) - self.sections[itspace] = Profile(lname, ops, memory) - break - processed = Transformer(mapper).visit(List(body=nodes)) - return processed - - def _profile_summary(self, dim_sizes): - """ - Produce a summary of the performance achieved - """ - summary = PerformanceSummary() - for itspace, profile in self.sections.items(): - dims = {i: i.dim.parent if i.dim.is_Buffered else i.dim for i in itspace} - - # Time - time = self.profiler.timings[profile.timer] - - # Flops - itershape = [i.extent(finish=dim_sizes.get(dims[i].name)) for i in itspace] - iterspace = reduce(operator.mul, itershape) - flops = float(profile.ops*iterspace) - gflops = flops/10**9 - - # Compulsory traffic - datashape = [i.dim.size or dim_sizes[dims[i].name] for i in itspace] - dataspace = reduce(operator.mul, datashape) - traffic = profile.memory*dataspace*self.dtype().itemsize - - # Derived metrics - oi = flops/traffic - gflopss = gflops/time - - # Keep track of performance achieved - summary.setsection(profile.timer, time, gflopss, oi, itershape, datashape) - - # Rename the most time consuming section as 'main' - summary['main'] = summary.pop(max(summary, key=summary.get)) - - return summary + return create_profile(nodes) def _extra_arguments(self): return OrderedDict([(self.profiler.typename, self.profiler.setup())]) @@ -606,11 +541,40 @@ def _cglobals(self): return [self.profiler.ctype, blankline] +class OperatorDebug(OperatorCore): + """ + Decorate the generated code with useful print statements. + """ + + def __init__(self, expressions, **kwargs): + super(OperatorDebug, self).__init__(expressions, **kwargs) + self._includes.append('stdio.h') + + # Minimize the trip count of the sequential loops + iterations = set(flatten(retrieve_iteration_tree(self.body))) + mapper = {i: i._rebuild(limits=(max(i.offsets) + 2)) + for i in iterations if i.is_Sequential} + self.body = Transformer(mapper).visit(self.body) + + # Mark entry/exit points of each non-sequential Iteration tree in the body + iterations = [filter_iterations(i, lambda i: not i.is_Sequential, 'any') + for i in retrieve_iteration_tree(self.body)] + iterations = [i[0] for i in iterations if i] + mapper = {t: List(header=printmark('In nest %d' % i), body=t) + for i, t in enumerate(iterations)} + self.body = Transformer(mapper).visit(self.body) + + class Operator(object): def __new__(cls, *args, **kwargs): - # What type of Operator should I return ? - cls = OperatorForeign if kwargs.pop('external', False) else OperatorCore + # What type of Operator should I create ? + if kwargs.pop('external', False): + cls = OperatorForeign + elif kwargs.pop('debug', False): + cls = OperatorDebug + else: + cls = OperatorCore # Trigger instantiation obj = cls.__new__(cls, *args, **kwargs) @@ -618,34 +582,6 @@ def __new__(cls, *args, **kwargs): return obj -# Helpers for performance tracking - -PerfEntry = namedtuple('PerfEntry', 'time gflopss oi itershape datashape') -"""A helper to return structured performance data.""" - - -class PerformanceSummary(OrderedDict): - - """ - A special dictionary to track and view performance data. - """ - - def setsection(self, key, time, gflopss, oi, itershape, datashape): - self[key] = PerfEntry(time, gflopss, oi, itershape, datashape) - - @property - def gflopss(self): - return OrderedDict([(k, v.gflopss) for k, v in self.items()]) - - @property - def oi(self): - return OrderedDict([(k, v.oi) for k, v in self.items()]) - - @property - def timings(self): - return OrderedDict([(k, v.time) for k, v in self.items()]) - - # Misc helpers cnames = { @@ -654,9 +590,6 @@ def timings(self): } """A dict of standard names to be used for code generation.""" -Profile = namedtuple('Profile', 'timer ops memory') -"""A helper to track profiled sections of code.""" - def set_dle_mode(mode): """ diff --git a/devito/profiling.py b/devito/profiling.py index 487c7b725b..214406f29c 100644 --- a/devito/profiling.py +++ b/devito/profiling.py @@ -1,6 +1,89 @@ +from __future__ import absolute_import + +import operator +from collections import OrderedDict, namedtuple +from functools import reduce + from ctypes import Structure, byref, c_double from cgen import Struct, Value +from devito.dse import estimate_cost, estimate_memory +from devito.nodes import Expression, TimedList +from devito.visitors import IsPerfectIteration, FindSections, FindNodes, Transformer + +__all__ = ['Profile', 'create_profile'] + + +def create_profile(node): + """ + Create a :class:`Profiler` for the Iteration/Expression tree ``node``. + The following code sections are profiled: :: + + * The whole ``node``; + * A sequence of perfectly nested loops that have common :class:`Iteration` + dimensions, but possibly different extent. For example: :: + + for x = 0 to N + .. + for x = 1 to N-1 + .. + + Both Iterations have dimension ``x``, and will be profiled as a single + section, though their extent is different. + * Any perfectly nested loops. + """ + profiler = Profiler() + + # Group by root Iteration + mapper = OrderedDict() + for itspace in FindSections().visit(node): + mapper.setdefault(itspace[0], []).append(itspace) + + # Group sections if their iteration spaces overlap + key = lambda itspace: set([i.dim for i in itspace]) + found = [] + for v in mapper.values(): + queue = list(v) + handle = [] + while queue: + item = queue.pop(0) + if not handle or key(item) == key(handle[0]): + handle.append(item) + else: + # Found a timing section + found.append(tuple(handle)) + handle = [item] + if handle: + found.append(tuple(handle)) + + # Create and track C-level timers + mapper = OrderedDict() + for i, group in enumerate(found): + name = 'section_%d' % i + section, remainder = group[0], group[1:] + + index = len(section) > 1 and not IsPerfectIteration().visit(section[0]) + root = section[index] + + # Prepare to transform the Iteration/Expression tree + body = tuple(j[index] for j in group) + mapper[root] = TimedList(gname=profiler.varname, lname=name, body=body) + for j in remainder: + mapper[j[index]] = None + + # Estimate computational properties of the profiled section + expressions = FindNodes(Expression).visit(body) + ops = estimate_cost([e.expr for e in expressions]) + memory = estimate_memory([e.expr for e in expressions]) + + # Keep track of the new profiled section + profiler.add(name, section, ops, memory) + + # Transform the Iteration/Expression tree introducing the C-level timers + processed = Transformer(mapper).visit(node) + + return processed, profiler + class Profiler(object): @@ -12,22 +95,20 @@ class Profiler(object): typename = "profiler" def __init__(self): - self._timers = [] + # To be populated as new sections are tracked + self._sections = OrderedDict() self._C_timings = None - def add(self, name): + def add(self, name, section, ops, memory): """ - Add a C-level timer. - """ - self._timers.append(name) + Add a profiling section. - @property - def ctype(self): + :param name: The name which uniquely identifies the profiled code section. + :param section: The code section, represented as a tuple of :class:`Iteration`s. + :param ops: The number of floating-point operations in the section. + :param memory: The memory traffic in the section, as bytes moved from/to memory. """ - Returns a :class:`cgen.Struct` relative to the profiler. - """ - return Struct(Profiler.typename, - [Value('double', n) for n in self._timers]) + self._sections[section] = Profile(name, ops, memory) def setup(self): """ @@ -35,10 +116,51 @@ def setup(self): all timers added to ``self`` through ``self.add(...)``. """ cls = type("Timings", (Structure,), - {"_fields_": [(n, c_double) for n in self._timers]}) + {"_fields_": [(i.name, c_double) for i in self._sections.values()]}) self._C_timings = cls() return byref(self._C_timings) + def summary(self, dim_sizes, dtype): + """ + Return a summary of the performance numbers measured. + + :param dim_sizes: The run-time extent of each :class:`Iteration` tracked + by this Profiler. Used to compute the operational intensity + and the perfomance achieved in GFlops/s. + :param dtype: The data type of the objects in the profiled sections. Used + to compute the operational intensity. + """ + + summary = PerformanceSummary() + for itspace, profile in self._sections.items(): + dims = {i: i.dim.parent if i.dim.is_Buffered else i.dim for i in itspace} + + # Time + time = self.timings[profile.name] + + # Flops + itershape = [i.extent(finish=dim_sizes.get(dims[i].name)) for i in itspace] + iterspace = reduce(operator.mul, itershape) + flops = float(profile.ops*iterspace) + gflops = flops/10**9 + + # Compulsory traffic + datashape = [i.dim.size or dim_sizes[dims[i].name] for i in itspace] + dataspace = reduce(operator.mul, datashape) + traffic = profile.memory*dataspace*dtype().itemsize + + # Derived metrics + oi = flops/traffic + gflopss = gflops/time + + # Keep track of performance achieved + summary.setsection(profile.name, time, gflopss, oi, itershape, datashape) + + # Rename the most time consuming section as 'main' + summary['main'] = summary.pop(max(summary, key=summary.get)) + + return summary + @property def timings(self): """ @@ -48,3 +170,41 @@ def timings(self): raise RuntimeError("Cannot extract timings with non-finalized Profiler.") return {field: max(getattr(self._C_timings, field), 10**-6) for field, _ in self._C_timings._fields_} + + @property + def ctype(self): + """ + Returns a :class:`cgen.Struct` relative to the profiler. + """ + return Struct(Profiler.typename, + [Value('double', i.name) for i in self._sections.values()]) + + +class PerformanceSummary(OrderedDict): + + """ + A special dictionary to track and quickly access performance data. + """ + + def setsection(self, key, time, gflopss, oi, itershape, datashape): + self[key] = PerfEntry(time, gflopss, oi, itershape, datashape) + + @property + def gflopss(self): + return OrderedDict([(k, v.gflopss) for k, v in self.items()]) + + @property + def oi(self): + return OrderedDict([(k, v.oi) for k, v in self.items()]) + + @property + def timings(self): + return OrderedDict([(k, v.time) for k, v in self.items()]) + + +Profile = namedtuple('Profile', 'name ops memory') +"""Metadata for a profiled code section.""" + + +PerfEntry = namedtuple('PerfEntry', 'time gflopss oi itershape datashape') +"""Structured performance data.""" diff --git a/devito/tools.py b/devito/tools.py index 9295af0e59..b5ff9416f9 100644 --- a/devito/tools.py +++ b/devito/tools.py @@ -134,27 +134,6 @@ def convert_dtype_to_ctype(dtype): return conversion_dict[dtype] -def aligned(a, alignment=16): - """Function to align the memmory - - :param a: The given memory - :param alignment: Granularity of alignment, 16 bytes by default - :returns: Reference to the start of the aligned memory - """ - if (a.ctypes.data % alignment) == 0: - return a - - extra = alignment / a.itemsize - buf = np.empty(a.size + extra, dtype=a.dtype) - ofs = (-buf.ctypes.data % alignment) / a.itemsize - aa = buf[ofs:ofs+a.size].reshape(a.shape) - np.copyto(aa, a) - - assert (aa.ctypes.data % alignment) == 0 - - return aa - - def pprint(node, verbose=True): """ Shortcut to pretty print Iteration/Expression trees. diff --git a/devito/visitors.py b/devito/visitors.py index 406be6abd6..0732a85b68 100644 --- a/devito/visitors.py +++ b/devito/visitors.py @@ -7,20 +7,22 @@ from __future__ import absolute_import import inspect -from collections import OrderedDict, defaultdict +from collections import Iterable, OrderedDict, defaultdict from operator import attrgetter import cgen as c from sympy import Symbol from devito.dimension import LoweredDimension +from devito.exceptions import VisitorException from devito.nodes import Iteration, List, Node from devito.tools import as_tuple, filter_ordered, filter_sorted, flatten __all__ = ['FindNodes', 'FindSections', 'FindSymbols', 'FindScopes', 'IsPerfectIteration', 'SubstituteExpression', 'printAST', - 'ResolveIterationVariable', 'Transformer', 'NestedTransformer'] + 'ResolveIterationVariable', 'Transformer', 'NestedTransformer', + 'FindAdjacentIterations'] class Visitor(object): @@ -225,11 +227,15 @@ def default_retval(cls): """ def visit_tuple(self, o, ret=None, queue=None): + if ret is None: + ret = self.default_retval() for i in o: ret = self.visit(i, ret=ret, queue=queue) return ret def visit_Node(self, o, ret=None, queue=None): + if ret is None: + ret = self.default_retval() for i in o.children: ret = self.visit(i, ret=ret, queue=queue) return ret @@ -361,10 +367,60 @@ def visit_Node(self, o, ret=None): return ret +class FindAdjacentIterations(Visitor): + + @classmethod + def default_retval(cls): + return OrderedDict([('seen_iteration', False)]) + + """ + Return a mapper from nodes N in an Expression/Iteration tree to sequences of + :class:`Iteration` objects I = [I_0, I_1, ...], where N is the direct ancestor of + the items in I and all items in I are adjacent nodes in the tree. + """ + + def handler(self, o, parent=None, ret=None): + if ret is None: + ret = self.default_retval() + if parent is None: + return ret + group = [] + for i in o: + ret = self.visit(i, parent=parent, ret=ret) + if ret['seen_iteration'] is True: + group.append(i) + else: + if len(group) > 1: + ret.setdefault(parent, []).append(tuple(group)) + # Reset the group, Iterations no longer adjacent + group = [] + # Potential leftover + if len(group) > 1: + ret.setdefault(parent, []).append(tuple(group)) + return ret + + def visit_object(self, o, parent=None, ret=None): + return ret + + def visit_tuple(self, o, parent=None, ret=None): + return self.handler(o, parent=parent, ret=ret) + + def visit_Node(self, o, parent=None, ret=None): + ret = self.handler(o.children, parent=o, ret=ret) + ret['seen_iteration'] = False + return ret + + def visit_Iteration(self, o, parent=None, ret=None): + ret = self.handler(o.children, parent=o, ret=ret) + ret['seen_iteration'] = True + return ret + + class IsPerfectIteration(Visitor): - """Return True if an :class:`Iteration` defines a perfect loop nest, - False otherwise.""" + """ + Return True if an :class:`Iteration` defines a perfect loop nest, False otherwise. + """ def visit_object(self, o, **kwargs): return False @@ -385,9 +441,15 @@ def visit_Iteration(self, o, found=False, multi=False): class Transformer(Visitor): - """Given an Iteration/Expression tree T and a mapper from nodes in T to + """ + Given an Iteration/Expression tree T and a mapper from nodes in T to a set of new nodes L, M : N --> L, build a new Iteration/Expression tree T' where a node ``n`` in N is replaced with ``M[n]``. + + In the special case in which ``M[n]`` is None, ``n`` is dropped from T'. + + In the special case in which ``M[n]`` is an iterable of nodes, ``n`` is + "extended" by pre-pending to its body the nodes in ``M[n]``. """ def __init__(self, mapper={}): @@ -399,14 +461,24 @@ def visit_object(self, o, **kwargs): return o def visit_tuple(self, o, **kwargs): - return tuple(self.visit(i, **kwargs) for i in o) + visited = tuple(self.visit(i, **kwargs) for i in o) + return tuple(i for i in visited if i is not None) visit_list = visit_tuple def visit_Node(self, o, **kwargs): if o in self.mapper: handle = self.mapper[o] - return handle._rebuild(**handle.args) + if handle is None: + # None -> drop /o/ + return None + elif isinstance(handle, Iterable): + if not o.children: + raise VisitorException + extended = (tuple(handle) + o.children[0],) + o.children[1:] + return o._rebuild(*extended, **o.args_frozen) + else: + return handle._rebuild(**handle.args) else: rebuilt = [self.visit(i, **kwargs) for i in o.children] return o._rebuild(*rebuilt, **o.args_frozen) @@ -419,15 +491,25 @@ def visit(self, o, *args, **kwargs): class NestedTransformer(Transformer): + """ - As opposed to a :class:`Transformer`, a :class:`NestedTransforer` applies + Unlike a :class:`Transformer`, a :class:`NestedTransforer` applies replacements in a depth-first fashion. """ def visit_Node(self, o, **kwargs): rebuilt = [self.visit(i, **kwargs) for i in o.children] handle = self.mapper.get(o, o) - return handle._rebuild(*rebuilt, **handle.args_frozen) + if handle is None: + # None -> drop /o/ + return None + elif isinstance(handle, Iterable): + if not o.children: + raise VisitorException + extended = [tuple(handle) + rebuilt[0]] + rebuilt[1:] + return o._rebuild(*extended, **o.args_frozen) + else: + return handle._rebuild(*rebuilt, **handle.args_frozen) class SubstituteExpression(Transformer): diff --git a/tests/test_dle.py b/tests/test_dle.py index 48b42c0f89..403847f59c 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -148,11 +148,11 @@ def test_create_elemental_functions_simple(simple_function): { for (int j = 0; j < 5; j += 1) { - f_0_0((float*) a,(float*) b,(float*) c,(float*) d,i,j); + f_0((float*) a,(float*) b,(float*) c,(float*) d,i,j); } } } -void f_0_0(float *restrict a_vec, float *restrict b_vec,""" +void f_0(float *restrict a_vec, float *restrict b_vec,""" """ float *restrict c_vec, float *restrict d_vec, const int i, const int j) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; @@ -187,15 +187,15 @@ def test_create_elemental_functions_complex(complex_function): float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec; for (int i = 0; i < 3; i += 1) { - f_0_0((float*) a,(float*) b,i); + f_0((float*) a,(float*) b,i); for (int j = 0; j < 5; j += 1) { - f_0_1((float*) a,(float*) b,(float*) c,(float*) d,i,j); + f_1((float*) a,(float*) b,(float*) c,(float*) d,i,j); } - f_0_2((float*) a,(float*) b,i); + f_2((float*) a,(float*) b,i); } } -void f_0_0(float *restrict a_vec, float *restrict b_vec, const int i) +void f_0(float *restrict a_vec, float *restrict b_vec, const int i) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec; @@ -204,7 +204,7 @@ def test_create_elemental_functions_complex(complex_function): b[i] = a[i] + pow(b[i], 2) + 3; } } -void f_0_1(float *restrict a_vec, float *restrict b_vec,""" +void f_1(float *restrict a_vec, float *restrict b_vec,""" """ float *restrict c_vec, float *restrict d_vec, const int i, const int j) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; @@ -217,7 +217,7 @@ def test_create_elemental_functions_complex(complex_function): a[i] = 4*(a[i] + c[i][j])*(b[i] + d[i][j][k]); } } -void f_0_2(float *restrict a_vec, float *restrict b_vec, const int i) +void f_2(float *restrict a_vec, float *restrict b_vec, const int i) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec; diff --git a/tests/test_operator.py b/tests/test_operator.py index 0e2dc499fd..bb609491f8 100644 --- a/tests/test_operator.py +++ b/tests/test_operator.py @@ -264,15 +264,15 @@ def test_heap_1D_stencil(self, a, b): assert """\ float (*a); posix_memalign((void**)&a, 64, sizeof(float[3])); - struct timeval start_loop_i_0, end_loop_i_0; - gettimeofday(&start_loop_i_0, NULL); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int i = 0; i < 3; i += 1) { a[i] = a[i] + b[i] + 5.0F; } - gettimeofday(&end_loop_i_0, NULL); - timings->loop_i_0 += (double)(end_loop_i_0.tv_sec-start_loop_i_0.tv_sec)\ -+(double)(end_loop_i_0.tv_usec-start_loop_i_0.tv_usec)/1000000; + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; free(a); return 0;""" in str(operator.ccode) @@ -283,8 +283,8 @@ def test_heap_perfect_2D_stencil(self, a, c): float (*c)[5]; posix_memalign((void**)&a, 64, sizeof(float[3])); posix_memalign((void**)&c, 64, sizeof(float[3][5])); - struct timeval start_loop_i_0, end_loop_i_0; - gettimeofday(&start_loop_i_0, NULL); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) @@ -293,9 +293,9 @@ def test_heap_perfect_2D_stencil(self, a, c): c[i][j] = a[i]*c[i][j]; } } - gettimeofday(&end_loop_i_0, NULL); - timings->loop_i_0 += (double)(end_loop_i_0.tv_sec-start_loop_i_0.tv_sec)\ -+(double)(end_loop_i_0.tv_usec-start_loop_i_0.tv_usec)/1000000; + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; free(a); free(c); return 0;""" in str(operator.ccode) @@ -307,19 +307,19 @@ def test_heap_imperfect_2D_stencil(self, a, c): float (*c)[5]; posix_memalign((void**)&a, 64, sizeof(float[3])); posix_memalign((void**)&c, 64, sizeof(float[3][5])); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int i = 0; i < 3; i += 1) { a[i] = 0.0F; - struct timeval start_loop_j_0, end_loop_j_0; - gettimeofday(&start_loop_j_0, NULL); for (int j = 0; j < 5; j += 1) { c[i][j] = a[i]*c[i][j]; } - gettimeofday(&end_loop_j_0, NULL); - timings->loop_j_0 += (double)(end_loop_j_0.tv_sec-start_loop_j_0.tv_sec)\ -+(double)(end_loop_j_0.tv_usec-start_loop_j_0.tv_usec)/1000000; } + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; free(a); free(c); return 0;""" in str(operator.ccode) @@ -330,32 +330,32 @@ def test_stack_scalar_temporaries(self, a, t0, t1): assert """\ float (*a); posix_memalign((void**)&a, 64, sizeof(float[3])); - struct timeval start_loop_i_0, end_loop_i_0; - gettimeofday(&start_loop_i_0, NULL); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int i = 0; i < 3; i += 1) { float t0 = 1.00000000000000F; float t1 = 2.00000000000000F; a[i] = 3.0F*t0*t1; } - gettimeofday(&end_loop_i_0, NULL); - timings->loop_i_0 += (double)(end_loop_i_0.tv_sec-start_loop_i_0.tv_sec)\ -+(double)(end_loop_i_0.tv_usec-start_loop_i_0.tv_usec)/1000000; + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; free(a); return 0;""" in str(operator.ccode) def test_stack_vector_temporaries(self, c_stack, e): operator = Operator([Eq(c_stack, e*1.)], dse='noop', dle='noop') assert """\ - struct timeval start_loop_k_0, end_loop_k_0; - gettimeofday(&start_loop_k_0, NULL); + struct timeval start_section_0, end_section_0; + gettimeofday(&start_section_0, NULL); for (int k = 0; k < 7; k += 1) { for (int s = 0; s < 4; s += 1) { for (int q = 0; q < 4; q += 1) { - double c_stack[3][5] __attribute__((aligned(64))); + float c_stack[3][5] __attribute__((aligned(64))); for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) @@ -366,9 +366,9 @@ def test_stack_vector_temporaries(self, c_stack, e): } } } - gettimeofday(&end_loop_k_0, NULL); - timings->loop_k_0 += (double)(end_loop_k_0.tv_sec-start_loop_k_0.tv_sec)\ -+(double)(end_loop_k_0.tv_usec-start_loop_k_0.tv_usec)/1000000; + gettimeofday(&end_section_0, NULL); + timings->section_0 += (double)(end_section_0.tv_sec-start_section_0.tv_sec)\ ++(double)(end_section_0.tv_usec-start_section_0.tv_usec)/1000000; return 0;""" in str(operator.ccode) @@ -403,7 +403,7 @@ def test_expressions_imperfect_loops(self, ti0, ti1, ti2, t0): assert outer[-1].nodes[0].expr.rhs == eq1.rhs assert inner[-1].nodes[0].expr.rhs == eq2.rhs - def test_different_loop_nests(self, tu, ti0, t0, t1): + def test_different_section_nests(self, tu, ti0, t0, t1): eq1 = Eq(ti0, t0*3.) eq2 = Eq(tu, ti0 + t1*3.) op = Operator([eq1, eq2], dse='noop', dle='noop')