Commit
Merge branch 'master' into implement-SubsetList-and-covers_precise
acalotoiu authored Nov 3, 2023
2 parents ab11b20 + 9430e87 commit 26c17a8
Showing 19 changed files with 1,784 additions and 583 deletions.
13 changes: 8 additions & 5 deletions dace/codegen/instrumentation/papi.py
@@ -12,7 +12,7 @@
from dace.sdfg.graph import SubgraphView
from dace.memlet import Memlet
from dace.sdfg import scope_contains_scope
-from dace.sdfg.state import StateGraphView
+from dace.sdfg.state import DataflowGraphView

import sympy as sp
import os
@@ -392,7 +392,7 @@ def should_instrument_entry(map_entry: EntryNode) -> bool:
return cond

@staticmethod
-def has_surrounding_perfcounters(node, dfg: StateGraphView):
+def has_surrounding_perfcounters(node, dfg: DataflowGraphView):
""" Returns true if there is a possibility that this node is part of a
section that is profiled. """
parent = dfg.entry_node(node)
@@ -605,7 +605,7 @@ def get_memlet_byte_size(sdfg: dace.SDFG, memlet: Memlet):
return memlet.volume * memdata.dtype.bytes

@staticmethod
-def get_out_memlet_costs(sdfg: dace.SDFG, state_id: int, node: nodes.Node, dfg: StateGraphView):
+def get_out_memlet_costs(sdfg: dace.SDFG, state_id: int, node: nodes.Node, dfg: DataflowGraphView):
scope_dict = sdfg.node(state_id).scope_dict()

out_costs = 0
@@ -636,7 +636,10 @@ def get_out_memlet_costs(sdfg: dace.SDFG, state_id: int, node: nodes.Node, dfg:
return out_costs

@staticmethod
-def get_tasklet_byte_accesses(tasklet: nodes.CodeNode, dfg: StateGraphView, sdfg: dace.SDFG, state_id: int) -> str:
+def get_tasklet_byte_accesses(tasklet: nodes.CodeNode,
+                              dfg: DataflowGraphView,
+                              sdfg: dace.SDFG,
+                              state_id: int) -> str:
""" Get the amount of bytes processed by `tasklet`. The formula is
sum(inedges * size) + sum(outedges * size) """
in_accum = []
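As a quick illustration of the byte-access formula in the docstring above, a minimal worked example with hypothetical volumes and element type (not part of the diff):

```python
# One incoming memlet of volume 100 and one outgoing memlet of volume 100,
# both over float64 (8 bytes per element):
in_bytes = 100 * 8    # sum(inedges * size)
out_bytes = 100 * 8   # sum(outedges * size)
total = in_bytes + out_bytes  # 1600 bytes processed by the tasklet
```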
@@ -693,7 +696,7 @@ def get_memory_input_size(node, sdfg, state_id) -> str:
return sym2cpp(input_size)

@staticmethod
-def accumulate_byte_movement(outermost_node, node, dfg: StateGraphView, sdfg, state_id):
+def accumulate_byte_movement(outermost_node, node, dfg: DataflowGraphView, sdfg, state_id):

itvars = dict() # initialize an empty dict

90 changes: 82 additions & 8 deletions dace/codegen/targets/cuda.py
@@ -445,7 +445,7 @@ def node_dispatch_predicate(self, sdfg, state, node):
if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes
if node.schedule in dtypes.GPU_SCHEDULES:
return True
-if isinstance(node, nodes.NestedSDFG) and CUDACodeGen._in_device_code:
+if CUDACodeGen._in_device_code:
return True
return False

@@ -1324,11 +1324,11 @@ def generate_devicelevel_state(self, sdfg, state, function_stream, callsite_stre

if write_scope == 'grid':
callsite_stream.write("if (blockIdx.x == 0 "
"&& threadIdx.x == 0) "
"{ // sub-graph begin", sdfg, state.node_id)
"&& threadIdx.x == 0) "
"{ // sub-graph begin", sdfg, state.node_id)
elif write_scope == 'block':
callsite_stream.write("if (threadIdx.x == 0) "
"{ // sub-graph begin", sdfg, state.node_id)
"{ // sub-graph begin", sdfg, state.node_id)
else:
callsite_stream.write("{ // subgraph begin", sdfg, state.node_id)
else:
@@ -2519,15 +2519,17 @@ def generate_devicelevel_scope(self, sdfg, dfg_scope, state_id, function_stream,
def generate_node(self, sdfg, dfg, state_id, node, function_stream, callsite_stream):
if self.node_dispatch_predicate(sdfg, dfg, node):
# Dynamically obtain node generator according to class name
-gen = getattr(self, '_generate_' + type(node).__name__)
-gen(sdfg, dfg, state_id, node, function_stream, callsite_stream)
-return
+gen = getattr(self, '_generate_' + type(node).__name__, False)
+if gen is not False:  # Not every node type has a code generator here
+    gen(sdfg, dfg, state_id, node, function_stream, callsite_stream)
+    return

if not CUDACodeGen._in_device_code:
self._cpu_codegen.generate_node(sdfg, dfg, state_id, node, function_stream, callsite_stream)
return

-self._locals.clear_scope(self._code_state.indentation + 1)
+if isinstance(node, nodes.ExitNode):
+    self._locals.clear_scope(self._code_state.indentation + 1)

if CUDACodeGen._in_device_code and isinstance(node, nodes.MapExit):
return # skip
@@ -2591,6 +2593,78 @@ def _generate_MapExit(self, sdfg, dfg, state_id, node, function_stream, callsite

self._cpu_codegen._generate_MapExit(sdfg, dfg, state_id, node, function_stream, callsite_stream)

def _get_thread_id(self) -> str:
result = 'threadIdx.x'
if self._block_dims[1] != 1:
result += f' + ({sym2cpp(self._block_dims[0])}) * threadIdx.y'
if self._block_dims[2] != 1:
result += f' + ({sym2cpp(self._block_dims[0] * self._block_dims[1])}) * threadIdx.z'
return result

def _get_warp_id(self) -> str:
return f'(({self._get_thread_id()}) / warpSize)'

def _get_block_id(self) -> str:
result = 'blockIdx.x'
if self._block_dims[1] != 1:
result += f' + gridDim.x * blockIdx.y'
if self._block_dims[2] != 1:
result += f' + gridDim.x * gridDim.y * blockIdx.z'
return result
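For intuition, a minimal Python sketch (not part of the diff) of the linearization that _get_thread_id and _get_warp_id emit as CUDA expressions, assuming a hypothetical (32, 4, 2) thread block:

```python
def linear_thread_id(tx: int, ty: int, tz: int, block_dims=(32, 4, 2)) -> int:
    # Mirrors _get_thread_id: x-fastest linearization of the thread index
    # within one block (block_dims stands in for self._block_dims).
    return tx + block_dims[0] * ty + block_dims[0] * block_dims[1] * tz

# Thread (1, 2, 1) in a (32, 4, 2) block -> 1 + 32*2 + 128*1 = 193,
# which belongs to warp 193 // 32 == 6 (warpSize is 32 on NVIDIA GPUs).
print(linear_thread_id(1, 2, 1))
```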

def _generate_condition_from_location(self, name: str, index_expr: str, node: nodes.Tasklet,
callsite_stream: CodeIOStream) -> str:
if name not in node.location:
return 0

location: Union[int, str, subsets.Range] = node.location[name]
if isinstance(location, str) and ':' in location:
location = subsets.Range.from_string(location)
elif symbolic.issymbolic(location):
location = sym2cpp(location)

if isinstance(location, subsets.Range):
# Range of indices
if len(location) != 1:
raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given')
begin, end, stride = location[0]
rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride)
cond = ''
cond += f'(({index_expr}) >= {rb}) && (({index_expr}) <= {re})'
if stride != 1:
cond += f' && ((({index_expr}) - {rb}) % {rs} == 0)'

callsite_stream.write(f'if ({cond}) {{')
else:
# Single-element
callsite_stream.write(f'if (({index_expr}) == {location}) {{')

return 1
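A standalone sketch (hypothetical helper, not in the diff) of the guard string the range branch above produces, assuming DaCe's inclusive range ends:

```python
def location_guard(index_expr: str, begin: int, end: int, stride: int = 1) -> str:
    # Mirrors the Range branch of _generate_condition_from_location above.
    cond = f'(({index_expr}) >= {begin}) && (({index_expr}) <= {end})'
    if stride != 1:
        cond += f' && ((({index_expr}) - {begin}) % {stride} == 0)'
    return f'if ({cond}) {{'

# e.g. restricting to warps 0-3 of a block:
# location_guard('warp_id', 0, 3) == 'if (((warp_id) >= 0) && ((warp_id) <= 3)) {'
```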

def _generate_Tasklet(self, sdfg: SDFG, dfg, state_id: int, node: nodes.Tasklet, function_stream: CodeIOStream,
callsite_stream: CodeIOStream):
generated_preamble_scopes = 0
if self._in_device_code:
# If location dictionary prescribes that the code should run on a certain group of threads/blocks,
# add condition
generated_preamble_scopes += self._generate_condition_from_location('gpu_thread', self._get_thread_id(),
node, callsite_stream)
generated_preamble_scopes += self._generate_condition_from_location('gpu_warp', self._get_warp_id(), node,
callsite_stream)
generated_preamble_scopes += self._generate_condition_from_location('gpu_block', self._get_block_id(), node,
callsite_stream)

# Call standard tasklet generation
old_codegen = self._cpu_codegen.calling_codegen
self._cpu_codegen.calling_codegen = self
self._cpu_codegen._generate_Tasklet(sdfg, dfg, state_id, node, function_stream, callsite_stream)
self._cpu_codegen.calling_codegen = old_codegen

if generated_preamble_scopes > 0:
# Generate appropriate postamble
for i in range(generated_preamble_scopes):
callsite_stream.write('}', sdfg, state_id, node)
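A hedged usage sketch (assuming the standard DaCe SDFG-building API) of the location keys this method consumes; the guards are only emitted for tasklets generated in device code, e.g. inside a GPU_Device map:

```python
import dace

sdfg = dace.SDFG('pinned_tasklet')
state = sdfg.add_state()
me, mx = state.add_map('gpu_map', dict(i='0:256'), schedule=dace.ScheduleType.GPU_Device)
t = state.add_tasklet('only_warp0', set(), set(), 'printf("hello from warp 0\\n");',
                      language=dace.Language.CPP)
state.add_nedge(me, t, dace.Memlet())
state.add_nedge(t, mx, dace.Memlet())
t.location['gpu_warp'] = 0  # picked up by _generate_condition_from_location
```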

def make_ptr_vector_cast(self, *args, **kwargs):
return cpp.make_ptr_vector_cast(*args, **kwargs)

2 changes: 1 addition & 1 deletion dace/sdfg/analysis/schedule_tree/sdfg_to_tree.py
@@ -275,7 +275,7 @@ def remove_name_collisions(sdfg: SDFG):
# Rename duplicate states
for state in nsdfg.nodes():
if state.label in state_names_seen:
-state.set_label(data.find_new_name(state.label, state_names_seen))
+state.label = data.find_new_name(state.label, state_names_seen)
state_names_seen.add(state.label)

replacements: Dict[str, str] = {}
5 changes: 2 additions & 3 deletions dace/sdfg/nodes.py
@@ -262,9 +262,8 @@ def label(self):
def __label__(self, sdfg, state):
return self.data

-def desc(self, sdfg):
-    from dace.sdfg import SDFGState, ScopeSubgraphView
-    if isinstance(sdfg, (SDFGState, ScopeSubgraphView)):
+def desc(self, sdfg: Union['dace.sdfg.SDFG', 'dace.sdfg.SDFGState', 'dace.sdfg.ScopeSubgraphView']):
+    if isinstance(sdfg, (dace.sdfg.SDFGState, dace.sdfg.ScopeSubgraphView)):
sdfg = sdfg.parent
return sdfg.arrays[self.data]

29 changes: 15 additions & 14 deletions dace/sdfg/replace.py
@@ -175,17 +175,18 @@ def replace_datadesc_names(sdfg, repl: Dict[str, str]):
sdfg.constants_prop[repl[aname]] = sdfg.constants_prop[aname]
del sdfg.constants_prop[aname]

-# Replace in interstate edges
-for e in sdfg.edges():
-    e.data.replace_dict(repl, replace_keys=False)
-
-for state in sdfg.nodes():
-    # Replace in access nodes
-    for node in state.data_nodes():
-        if node.data in repl:
-            node.data = repl[node.data]
-
-    # Replace in memlets
-    for edge in state.edges():
-        if edge.data.data in repl:
-            edge.data.data = repl[edge.data.data]
+for cf in sdfg.all_control_flow_regions():
+    # Replace in interstate edges
+    for e in cf.edges():
+        e.data.replace_dict(repl, replace_keys=False)
+
+    for block in cf.nodes():
+        if isinstance(block, dace.SDFGState):
+            # Replace in access nodes
+            for node in block.data_nodes():
+                if node.data in repl:
+                    node.data = repl[node.data]
+            # Replace in memlets
+            for edge in block.edges():
+                if edge.data.data in repl:
+                    edge.data.data = repl[edge.data.data]
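A hedged usage sketch of the updated helper (assuming an existing SDFG object `sdfg`); it now walks every control-flow region rather than only the top-level states:

```python
from dace.sdfg.replace import replace_datadesc_names

# Rewrite all references to the data container 'A' -- constants, interstate
# edges in every control-flow region, access nodes, and memlets -- to 'A_local'.
replace_datadesc_names(sdfg, {'A': 'A_local'})
```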
