spcl · alexnick83 · Nov 8, 2023 · Aug 2, 2023 · Aug 3, 2023 · Aug 3, 2023
diff --git a/dace/transformation/change_strides.py b/dace/transformation/change_strides.py
@@ -0,0 +1,210 @@
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+""" This module provides a function to change the stride in a given SDFG """
+from typing import List, Union, Tuple
+import sympy
+
+import dace
+from dace.dtypes import ScheduleType
+from dace.sdfg import SDFG, nodes, SDFGState
+from dace.data import Array, Scalar
+from dace.memlet import Memlet
+
+
+def list_access_nodes(
+        sdfg: dace.SDFG,
+        array_name: str) -> List[Tuple[nodes.AccessNode, Union[SDFGState, dace.SDFG]]]:
+    """
+    Collect every access node that refers to ``array_name`` in the states of ``sdfg``.
+    Only the states of ``sdfg`` itself are visited; nested SDFGs are not recursed into.
+
+    :param sdfg: SDFG whose states are scanned
+    :type sdfg: dace.SDFG
+    :param array_name: Name of the data container to look for
+    :type array_name: str
+    :return: ``(access node, containing state)`` pairs in traversal order
+    :rtype: List[Tuple[nodes.AccessNode, Union[dace.SDFGState, dace.SDFG]]]
+    """
+    return [
+        (candidate, current_state)
+        for current_state in sdfg.states() for candidate in current_state.nodes()
+        if isinstance(candidate, nodes.AccessNode) and candidate.data == array_name
+    ]
+
+
+def change_strides(
+        sdfg: dace.SDFG,
+        stride_one_values: List[str],
+        schedule: ScheduleType) -> SDFG:
+    """
+    Change the strides of the arrays on the given SDFG such that the given dimension has stride 1. Returns a new SDFG.
+    The given SDFG is modified in place (renamed, strides rewritten) and becomes a nested SDFG of the returned one.
+
+    :param sdfg: The input SDFG
+    :type sdfg: dace.SDFG
+    :param stride_one_values: Lengths (given as symbol names) of the dimensions whose stride should be set to one.
+    Expects that each array has only one dimension whose length is in this list
+    :type stride_one_values: List[str]
+    :param schedule: Schedule to use to copy the arrays
+    :type schedule: ScheduleType
+    :return: SDFG with changed strides
+    :rtype: SDFG
+    """
+    # Create new SDFG and copy constants and symbols
+    original_name = sdfg.name
+    sdfg.name = "changed_strides"
+    new_sdfg = SDFG(original_name)
+    for dname, value in sdfg.constants.items():
+        new_sdfg.add_constant(dname, value)
+    for dname, stype in sdfg.symbols.items():
+        new_sdfg.add_symbol(dname, stype)
+
+    changed_stride_state = new_sdfg.add_state("with_changed_strides", is_start_state=True)
+    inputs, outputs = sdfg.read_and_write_sets()
+    # Get all arrays which are persistent == not transient
+    persistent_arrays = {name: desc for name, desc in sdfg.arrays.items() if not desc.transient}
+
+    # If a persistent array is copied directly to/from another array, list the persistent one as input/output instead
+    for dname in persistent_arrays:
+        for access, state in list_access_nodes(sdfg, dname):
+            if len(state.out_edges(access)) == 1:
+                edge = state.out_edges(access)[0]
+                if isinstance(edge.dst, nodes.AccessNode) and edge.dst.data in inputs:
+                    inputs.remove(edge.dst.data)
+                    inputs.add(dname)
+            if len(state.in_edges(access)) == 1:
+                edge = state.in_edges(access)[0]
+                # Test membership in the set that is modified: set.remove raises KeyError for a missing element
+                if isinstance(edge.src, nodes.AccessNode) and edge.src.data in outputs:
+                    outputs.remove(edge.src.data)
+                    outputs.add(dname)
+
+    # Only keep inputs and outputs which are persistent
+    inputs.intersection_update(persistent_arrays.keys())
+    outputs.intersection_update(persistent_arrays.keys())
+    nsdfg = changed_stride_state.add_nested_sdfg(sdfg, new_sdfg, inputs=inputs, outputs=outputs)
+    transform_state = new_sdfg.add_state_before(changed_stride_state, label="transform_data", is_start_state=True)
+    transform_state_back = new_sdfg.add_state_after(changed_stride_state, "transform_data_back", is_start_state=False)
+
+    # Declare the persistent containers on the new SDFG; at this point they still carry their original strides
+    for dname, desc in sdfg.arrays.items():
+        if not desc.transient:
+            if isinstance(desc, Array):
+                new_sdfg.add_array(dname, desc.shape, desc.dtype, desc.storage,
+                                   desc.location, desc.transient, desc.strides,
+                                   desc.offset)
+            elif isinstance(desc, Scalar):
+                new_sdfg.add_scalar(dname, desc.dtype, desc.storage, desc.transient, desc.lifetime, desc.debuginfo)
+
+    new_order = {}
+    new_strides_map = {}
+
+    # Map of array names in the nested sdfg:  key: name inside the nsdfg (its input connector), value: name of the
+    # array in the parent sdfg (this sdfg). Assumes that name changes only appear in the first level of nsdfg nesting
+    array_names_map = {}
+    for graph in sdfg.sdfg_list:
+        if graph.parent_nsdfg_node is not None:
+            if graph.parent_sdfg == sdfg:
+                for connector in graph.parent_nsdfg_node.in_connectors:
+                    for in_edge in graph.parent.in_edges_by_connector(graph.parent_nsdfg_node, connector):
+                        array_names_map[str(connector)] = in_edge.data.data
+
+    for _, dname, desc in sdfg.arrays_recursive():
+        shape_str = [str(s) for s in desc.shape]
+        # Get index of the dimension we want to have stride 1
+        stride_one_idx = None
+        this_stride_one_value = None
+        for dim in stride_one_values:
+            if str(dim) in shape_str:
+                stride_one_idx = shape_str.index(str(dim))
+                this_stride_one_value = dim
+                break
+
+        if stride_one_idx is not None:
+            new_order[dname] = [stride_one_idx]
+
+            new_strides = list(desc.strides)
+            new_strides[stride_one_idx] = sympy.S.One
+
+            previous_size = dace.symbolic.symbol(this_stride_one_value)
+            previous_stride = sympy.S.One
+            for i in range(len(new_strides)):
+                if i != stride_one_idx:
+                    new_order[dname].append(i)
+                    new_strides[i] = previous_size * previous_stride
+                    previous_size = desc.shape[i]
+                    previous_stride = new_strides[i]
+
+            new_strides_map[dname] = {}
+            # Create a map entry for this data linking old strides to new strides. This assumes that each entry in
+            # strides is unique which is given as otherwise there would be two dimension i, j where a[i, j] would point
+            # to the same address as a[j, i]
+            for new_stride, old_stride in zip(new_strides, desc.strides):
+                new_strides_map[dname][old_stride] = new_stride
+            desc.strides = tuple(new_strides)
+        else:
+            # No dimension of this array matches: if its parent array was restrided, translate each old stride via
+            # the parent's old->new mapping. Stored as a tuple, consistent with the branch above
+            parent_name = array_names_map.get(dname, dname)
+            if parent_name in new_strides_map:
+                stride_map = new_strides_map[parent_name]
+                desc.strides = tuple(stride_map[stride] for stride in desc.strides)
+
+    # Add new flipped arrays for every non-transient array
+    flipped_names_map = {}
+    for dname, desc in sdfg.arrays.items():
+        if not desc.transient:
+            flipped_name = f"{dname}_flipped"
+            flipped_names_map[dname] = flipped_name
+            new_sdfg.add_array(flipped_name, desc.shape, desc.dtype,
+                               desc.storage, desc.location, True,
+                               desc.strides, desc.offset)
+
+    # Deal with the inputs: Create tasklet to flip them and connect via memlets
+    # Output-only arrays are copied too (presumably so elements the nested SDFG never writes survive)
+    for in_name in inputs | outputs:
+        if in_name in new_order:
+            flipped_data = flipped_names_map[in_name]
+            if in_name in inputs:
+                changed_stride_state.add_memlet_path(changed_stride_state.add_access(flipped_data), nsdfg,
+                                                     dst_conn=in_name, memlet=Memlet(data=flipped_data))
+            # Simply need to copy the data, the different strides take care of the transposing
+            arr = sdfg.arrays[in_name]
+            index_subset = ", ".join(f"_i{i}" for i in range(len(arr.shape)))
+            transform_state.add_mapped_tasklet(
+                    name=f"transpose_{in_name}",
+                    map_ranges={f"_i{i}": f"0:{s}" for i, s in enumerate(arr.shape)},
+                    inputs={'_in': Memlet(data=in_name, subset=index_subset)},
+                    code='_out = _in',
+                    outputs={'_out': Memlet(data=flipped_data, subset=index_subset)},
+                    external_edges=True,
+                    schedule=schedule,
+                    )
+    # Do the same for the outputs
+    for output in outputs:
+        if output in new_order:
+            flipped_data = flipped_names_map[output]
+            changed_stride_state.add_memlet_path(nsdfg, changed_stride_state.add_access(flipped_data),
+                                                 src_conn=output, memlet=Memlet(data=flipped_data))
+            # Simply need to copy the data, the different strides take care of the transposing
+            arr = sdfg.arrays[output]
+            index_subset = ", ".join(f"_i{i}" for i in range(len(arr.shape)))
+            transform_state_back.add_mapped_tasklet(
+                    name=f"transpose_{output}",
+                    map_ranges={f"_i{i}": f"0:{s}" for i, s in enumerate(arr.shape)},
+                    inputs={'_in': Memlet(data=flipped_data, subset=index_subset)},
+                    code='_out = _in',
+                    outputs={'_out': Memlet(data=output, subset=index_subset)},
+                    external_edges=True,
+                    schedule=schedule,
+                    )
+    # Deal with any arrays which have not been flipped (should only be scalars). Connect them directly
+    for dname, desc in sdfg.arrays.items():
+        if not desc.transient and dname not in new_order:
+            if dname in inputs:
+                changed_stride_state.add_memlet_path(changed_stride_state.add_access(dname), nsdfg, dst_conn=dname,
+                                                     memlet=Memlet(data=dname))
+            if dname in outputs:
+                changed_stride_state.add_memlet_path(nsdfg, changed_stride_state.add_access(dname), src_conn=dname,
+                                                     memlet=Memlet(data=dname))
+
+    return new_sdfg
diff --git a/dace/transformation/dataflow/map_expansion.py b/dace/transformation/dataflow/map_expansion.py
@@ -47,7 +47,7 @@ def apply(self, graph: dace.SDFGState, sdfg: dace.SDFG):
         new_maps = [
             nodes.Map(current_map.label + '_' + str(param), [param],
                       subsets.Range([param_range]),
-                      schedule=dtypes.ScheduleType.Sequential)
+                      schedule=current_map.schedule)
             for param, param_range in zip(current_map.params[1:], current_map.range[1:])
         ]
         current_map.params = [current_map.params[0]]

diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py
@@ -1137,7 +1137,8 @@ def traverse(state: SDFGState, treenode: ScopeTree):
                     ntree.state = nstate
                     treenode.children.append(ntree)
         for child in treenode.children:
-            traverse(getattr(child, 'state', state), child)
+            if hasattr(child, 'state') and child.state != state:
+                traverse(getattr(child, 'state', state), child)
 
     traverse(state, stree)
     return stree

diff --git a/dace/transformation/interstate/__init__.py b/dace/transformation/interstate/__init__.py
@@ -15,3 +15,4 @@
 from .move_loop_into_map import MoveLoopIntoMap
 from .trivial_loop_elimination import TrivialLoopElimination
 from .multistate_inline import InlineMultistateSDFG
+from .move_assignment_outside_if import MoveAssignmentOutsideIf
diff --git a/dace/transformation/interstate/move_assignment_outside_if.py b/dace/transformation/interstate/move_assignment_outside_if.py
@@ -0,0 +1,113 @@
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+""" 
+Transformation to move assignments outside if statements to potentially avoid warp divergence. Speedup gained is
+questionable.
+"""
+
+import ast
+import sympy as sp
+
+from dace import sdfg as sd
+from dace.sdfg import graph as gr
+from dace.sdfg.nodes import Tasklet, AccessNode
+from dace.transformation import transformation
+
+
+class MoveAssignmentOutsideIf(transformation.MultiStateTransformation):
+    """ Moves constant assignments to write-only data out of an if/else branch into a new state before the guard. """
+
+    if_guard = transformation.PatternNode(sd.SDFGState)
+    if_stmt = transformation.PatternNode(sd.SDFGState)
+    else_stmt = transformation.PatternNode(sd.SDFGState)
+
+    @classmethod
+    def expressions(cls):
+        # Pattern: a guard state with one interstate edge to each of the two branch states
+        sdfg = gr.OrderedDiGraph()
+        sdfg.add_nodes_from([cls.if_guard, cls.if_stmt, cls.else_stmt])
+        sdfg.add_edge(cls.if_guard, cls.if_stmt, sd.InterstateEdge())
+        sdfg.add_edge(cls.if_guard, cls.else_stmt, sd.InterstateEdge())
+        return [sdfg]
+
+    def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
+        """ Checks the if/else pattern and records in ``self`` which constant assignments ``apply`` can move. """
+        # The if-guard can only have two outgoing edges: to the if and to the else part
+        guard_outedges = graph.out_edges(self.if_guard)
+        if len(guard_outedges) != 2:
+            return False
+
+        # Outgoing edges must be a negation of each other
+        if guard_outedges[0].data.condition_sympy() != (sp.Not(guard_outedges[1].data.condition_sympy())):
+            return False
+
+        # The if guard should either have zero or one incoming edge
+        if len(sdfg.in_edges(self.if_guard)) > 1:
+            return False
+
+        # set of the variables which get a const value assigned
+        assigned_const = set()
+        # Dict which collects all AccessNodes for each variable together with its state
+        access_nodes = {}
+        # set of the variables which are only written to
+        self.write_only_values = set()
+        # Dictionary which stores additional information for the variables which are written only
+        self.assign_context = {}
+        for state in [self.if_stmt, self.else_stmt]:
+            for node in state.nodes():
+                if isinstance(node, Tasklet):
+                    # If node is a tasklet, check if assigns a constant value
+                    assigns_const = all(isinstance(stmt, ast.Assign) and isinstance(stmt.value, ast.Constant)
+                                        for stmt in node.code.code)
+                    if assigns_const:
+                        for edge in state.out_edges(node):
+                            if isinstance(edge.dst, AccessNode):
+                                assigned_const.add(edge.dst.data)
+                                self.assign_context[edge.dst.data] = {"state": state, "tasklet": node}
+                elif isinstance(node, AccessNode):
+                    access_nodes.setdefault(node.data, []).append((node, state))
+
+        # check that the found access nodes only get written to
+        for data, occurrences in access_nodes.items():
+            write_only = True
+            for node, state in occurrences:
+                if node.has_reads(state):
+                    # The read is only a problem if it is not written before -> the access node has no incoming edge
+                    if state.in_degree(node) == 0:
+                        write_only = False
+                    else:
+                        # There is also a problem if any edge (incoming or outgoing) is an update instead of write
+                        for edge in [*state.in_edges(node), *state.out_edges(node)]:
+                            if edge.data.wcr is not None:
+                                write_only = False
+
+            if write_only:
+                self.write_only_values.add(data)
+
+        # Want only the values which are only written to and one option uses a constant value
+        self.write_only_values = assigned_const.intersection(self.write_only_values)
+
+        return len(self.write_only_values) > 0
+
+    def apply(self, _, sdfg: sd.SDFG):
+        """ Moves each recorded constant-assignment tasklet and the access nodes it writes before the guard. """
+        # create a new state before the guard state where the constant assignment happens
+        new_assign_state = sdfg.add_state_before(self.if_guard, label="const_assignment_state")
+
+        # Move all the Tasklets together with the AccessNode
+        for value in self.write_only_values:
+            state = self.assign_context[value]["state"]
+            tasklet = self.assign_context[value]["tasklet"]
+            if tasklet not in state.nodes():
+                continue  # already moved together with another value written by the same tasklet
+            new_assign_state.add_node(tasklet)
+            for edge in state.out_edges(tasklet):
+                state.remove_edge(edge)
+                state.remove_node(edge.dst)
+                new_assign_state.add_node(edge.dst)
+                new_assign_state.add_edge(tasklet, edge.src_conn, edge.dst, edge.dst_conn, edge.data)
+
+            state.remove_node(tasklet)
+            # Remove the state if it was emptied
+            if state.is_empty():
+                sdfg.remove_node(state)
+        return sdfg
diff --git a/tests/transformations/change_strides_test.py b/tests/transformations/change_strides_test.py
@@ -0,0 +1,48 @@
+# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
+import dace
+from dace import nodes
+from dace.dtypes import ScheduleType
+from dace.memlet import Memlet
+from dace.transformation.change_strides import change_strides
+
+
+def change_strides_test():
+    """ Builds a single-map SDFG over A[N, M] and B[N, M, 3] and checks the result of ``change_strides``. """
+    N, M = dace.symbol('N'), dace.symbol('M')
+    sdfg = dace.SDFG('change_strides_test')
+    sdfg.add_array('A', [N, M], dace.float64)
+    sdfg.add_array('B', [N, M, 3], dace.float64)
+    state = sdfg.add_state()
+
+    # The returned (tasklet, map entry, map exit) nodes are not needed by the checks below
+    state.add_mapped_tasklet(
+            name="map1",
+            map_ranges={'i': '0:N', 'j': '0:M'},
+            inputs={'a': Memlet(data='A', subset='i, j')},
+            outputs={'b': Memlet(data='B', subset='i, j, 0')},
+            code='b = a + 1',
+            external_edges=True,
+            propagate=True)
+
+    # Check that states are as expected
+    changed_sdfg = change_strides(sdfg, ['N'], ScheduleType.Sequential)
+    assert len(changed_sdfg.states()) == 3
+    start_out_edges = changed_sdfg.out_edges(changed_sdfg.start_state)
+    assert len(start_out_edges) == 1
+    work_state = start_out_edges[0].dst
+    nested_nodes = [node for node in work_state.nodes() if isinstance(node, nodes.NestedSDFG)]
+    # Check shape and strides of data inside nested SDFG
+    assert nested_nodes
+    inner = nested_nodes[-1].sdfg
+    assert inner.data('A').shape == (N, M)
+    assert inner.data('B').shape == (N, M, 3)
+    assert inner.data('A').strides == (1, N)
+    assert inner.data('B').strides == (1, N, M*N)
+
+
+def main():
+    change_strides_test()  # single entry point used when this file is executed as a script
+
+
+if __name__ == '__main__':
+    main()