diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.cpp b/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.cpp index dd1002a8f931..387fc78c5d41 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.cpp @@ -7,6 +7,7 @@ #include "iree/compiler/Dialect/HAL/IR/HALDialect.h" #include "iree/compiler/Dialect/HAL/IR/HALOps.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" +#include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Utils/StringUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" @@ -893,6 +894,30 @@ bool DeviceAffinityAttr::isExecutableWith( return false; } +bool DeviceAffinityAttr::isTranslatableWith( + ModuleOp moduleOp, IREE::Stream::AffinityAttr other) const { + if (!other) + return true; + + auto otherAffinityAttr = llvm::dyn_cast_if_present(other); + if (!otherAffinityAttr) + return false; + + SymbolTable symbolTable(moduleOp); + auto getExecutableTargets = [&](SymbolRefAttr symbol) { + auto globalOp = llvm::cast( + symbolTable.lookupSymbolIn(moduleOp, symbol)); + return globalOp.getInitialValueAttr(); + }; + auto device = llvm::dyn_cast( + getExecutableTargets(getDevice())); + auto otherDevice = llvm::dyn_cast( + getExecutableTargets(otherAffinityAttr.getDevice())); + if (!device || !otherDevice) + return false; + return device.getExecutableTargets() == otherDevice.getExecutableTargets(); +} + IREE::Stream::AffinityAttr DeviceAffinityAttr::joinOR(IREE::Stream::AffinityAttr other) const { if (!other) diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.td index c0dbc59399ad..01d1fa2340f5 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.td +++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.td @@ -853,6 +853,7 @@ def HAL_DeviceSelectAttr : AttrDef, diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamInterfaces.td 
b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamInterfaces.td index 2b686b7478b6..79b292412527 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamInterfaces.td +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamInterfaces.td @@ -59,6 +59,20 @@ def Stream_AffinityAttr : AttrInterface<"AffinityAttr"> { return IREE::Stream::AffinityAttr::areCompatible($_attr, other); }] >, + InterfaceMethod< + /*desc=*/[{ + Returns true if it can share the same execution configuration (e.g., + translation artifacts) with the `other` and vice versa. + }], + /*retTy=*/"bool", + /*methodName=*/"isTranslatableWith", + /*args=*/(ins "ModuleOp":$moduleOp, + "IREE::Stream::AffinityAttr":$other), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return IREE::Stream::AffinityAttr::areTranslationCompatible(moduleOp, $_attr, other); + }] + >, InterfaceMethod< /*desc=*/[{ Returns an affinity describing the union with |other| constraints. @@ -118,6 +132,11 @@ def Stream_AffinityAttr : AttrInterface<"AffinityAttr"> { // Returns true if |lhs| and |rhs| indicate that their operations can // execute together on the same execution queue. static bool canExecuteTogether(AffinityAttr lhs, AffinityAttr rhs); + + // Returns true if |lhs| and |rhs| are translation compatible. E.g., they + // are compatible if they have the same executable targets. 
+ static bool areTranslationCompatible(ModuleOp moduleOp, AffinityAttr lhs, + AffinityAttr rhs); }]; } diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.cpp index 82b8609e90e5..93dadaddf64d 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.cpp @@ -393,6 +393,16 @@ bool AffinityAttr::canExecuteTogether(AffinityAttr lhs, AffinityAttr rhs) { return lhs.isExecutableWith(rhs); } +// static +bool AffinityAttr::areTranslationCompatible(ModuleOp moduleOp, AffinityAttr lhs, + AffinityAttr rhs) { + if (lhs == rhs) + return true; + if ((lhs && !rhs) || (rhs && !lhs)) + return true; + return lhs.isTranslatableWith(moduleOp, rhs); +} + //===----------------------------------------------------------------------===// // #stream.partitioning_config //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp index 40571f341ab9..e6675c4fd70a 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp @@ -11,11 +11,13 @@ #include "iree/compiler/Dialect/Stream/IR/StreamTraits.h" #include "iree/compiler/Dialect/Stream/IR/StreamTypes.h" #include "iree/compiler/Dialect/Stream/Transforms/Passes.h" +#include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/LogicalResult.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/SymbolTable.h" #include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" @@ -52,6 +54,285 @@ SmallVector 
gatherUsedDialectInterfaces(mlir::ModuleOp moduleOp) { return results; } +/// Disjoint-set data structure holding non-overlapping sets of aliasing +/// attributes. +class AffinityAliasingSet { +public: + void addAlias(Attribute aliasee, Attribute aliaser) { + auto aliaseeWithId = getWithId(aliasee); + auto aliaserWithId = getWithId(aliaser); + attrAliasing.unionSets(aliaseeWithId, aliaserWithId); + } + + IREE::Stream::AffinityAttr findLeader(Attribute attr) { + return cast( + attrAliasing.findLeader(getWithId(attr))->attr); + } + +private: + // EquivalenceClasses require ordering for attr type to return deterministic + // results, so we provide it by assigning id to all attrs added to the set. + struct NumberedAttribute { + Attribute attr; + int64_t id; + + static Attribute getAttribute(const NumberedAttribute &attr) { + return attr.attr; + } + }; + + struct Comparator { + int operator()(const NumberedAttribute &a, + const NumberedAttribute &b) const { + return a.id < b.id; + } + }; + + NumberedAttribute getWithId(Attribute attr) const { + auto [iterator, inserted] = id.try_emplace(attr, id.size()); + return {attr, iterator->second}; + } + + mutable llvm::DenseMap id; + llvm::EquivalenceClasses attrAliasing; +}; + +/// Returns the aliased affinities of the `dispatchOp`'s resource operands. An +/// empty array attribute indicates that the resource operand affinity is not +/// found. Usually, this happens when the affinity analysis fails. +/// Note that the size of the result might differ from the number of resource +/// operands. If a resource operand type is not AffinityType, it is skipped.
+static SmallVector getAliasedResourceOperandsAffinities( + IREE::Stream::AffinityAnalysis &affinityAnalysis, + IREE::Stream::AsyncDispatchOp dispatchOp, + AffinityAliasingSet &aliasingSet) { + SmallVector result; + Builder b(dispatchOp.getContext()); + auto emptyArray = b.getArrayAttr({}); + for (auto operand : dispatchOp.getResourceOperands()) { + // Skip if the operand type is not AffinityType. + if (!isa(operand.getType())) { + continue; + } + SmallVector affinities; + if (!affinityAnalysis.tryLookupResourceAffinity(operand, affinities)) { + result.push_back(emptyArray); + continue; + } + auto aliasedAffinities = llvm::map_to_vector( + affinities, [&](IREE::Stream::AffinityAttr attr) -> Attribute { + return aliasingSet.findLeader(attr); + }); + result.push_back(b.getArrayAttr(aliasedAffinities)); + } + return result; +} + +/// Returns the aliased execution affinity. The method assumes that the lookup +/// always succeeds and the number of execution affinities is one. +static IREE::Stream::AffinityAttr +getAliasedExecutionAffinity(IREE::Stream::AsyncDispatchOp dispatchOp, + IREE::Stream::AffinityAnalysis &affinityAnalysis, + AffinityAliasingSet &aliasingSet) { + SmallVector execAffinities; + [[maybe_unused]] bool succeed = + affinityAnalysis.tryLookupExecutionAffinity(dispatchOp, execAffinities); + assert(succeed && "failed on execution affinity lookup"); + assert(execAffinities.size() == 1 && "We should only have a single execution " + "affinity when running the pass."); + return aliasingSet.findLeader(execAffinities[0]); +} + +/// Returns the set of execution affinity and resource affinity from the `ops`. +/// Returns failure if there are failures in affinity lookup. 
+static FailureOr> +collectAllTheAffinities(IREE::Stream::AffinityAnalysis &affinityAnalysis, + ArrayRef ops) { + SetVector attrs; + for (auto dispatchOp : ops) { + SmallVector execAffinities; + if (!affinityAnalysis.tryLookupExecutionAffinity(dispatchOp, + execAffinities)) { + return dispatchOp.emitError("failed on execution affinity lookup"); + } + attrs.insert(execAffinities.begin(), execAffinities.end()); + + for (auto operand : dispatchOp.getResourceOperands()) { + // Skip if the operand type is not AffinityType. + if (!isa(operand.getType())) { + continue; + } + SmallVector affinities; + if (!affinityAnalysis.tryLookupResourceAffinity(operand, affinities)) { + continue; + } + attrs.insert(affinities.begin(), affinities.end()); + } + } + return attrs; +} + +/// Duplicates stream.executables based on the affinity analysis of +/// stream.async.dispatch ops. Some executables can be launched by different +/// devices. It can produce wrong codegen artifacts when binding types are +/// encoded (i.e., the tensor type has an encoding attribute), because they +/// can result in different layouts, especially when multi-device is involved. +/// E.g., say that device_a and device_b interpret a tensor type with +/// encodings in different layouts, and there is an executable that can be +/// launched with resources from either device_a or device_b. It is ambiguous +/// what the input layouts for the executable are because there are two +/// possibilities. In this case, we have to duplicate the executable with +/// updated encoding, and modify the dispatch to launch the proper executable +/// based on device analysis.
+static LogicalResult duplicateExecutablesPerAffinityVariant( + ModuleOp moduleOp, SymbolTable symbolTable, FunctionOpInterface funcOp, + IREE::Stream::ResolveLayoutAttrFn resolveLayoutAttr) { + MLIRContext *ctx = moduleOp.getContext(); + IRRewriter rewriter(ctx); + + // ------------------------------------------------------------------------ + // Gather per-export [execution affinity -> [resource affinities]] map. + // ------------------------------------------------------------------------ + + IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp); + if (failed(affinityAnalysis.run())) { + return moduleOp.emitError("failed on running affinity analysis"); + } + SmallVector candidates; + funcOp.walk( + [&](IREE::Stream::AsyncDispatchOp op) { candidates.push_back(op); }); + + // export -> [affinity -> array per resource of affinities PVS]. + DenseMap>> + exportToDispatchSites; + + // Build equivalence classes for each affinity, giving us nice clustered + // sets. Because some affinities could share the same executable targets. 
+ AffinityAliasingSet aliasingSet; + { + FailureOr> maybeAttrs = + collectAllTheAffinities(affinityAnalysis, candidates); + if (failed(maybeAttrs)) { + return funcOp.emitError("failed to collect all the affinities"); + } + for (auto affinity0 : *maybeAttrs) { + for (auto affinity1 : *maybeAttrs) { + if (IREE::Stream::AffinityAttr::areTranslationCompatible( + moduleOp, affinity0, affinity1)) { + aliasingSet.addAlias(affinity0, affinity1); + } + } + } + } + + llvm::MapVector> + resourceAffinities; + for (auto dispatchOp : candidates) { + IREE::Stream::AffinityAttr aliasedExecAffinity = + getAliasedExecutionAffinity(dispatchOp, affinityAnalysis, aliasingSet); + SmallVector operandAffinityAttrs = + getAliasedResourceOperandsAffinities(affinityAnalysis, dispatchOp, + aliasingSet); + resourceAffinities[dispatchOp] = operandAffinityAttrs; + + dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) { + auto exportOp = cast( + symbolTable.lookupSymbolIn(moduleOp, entryPoint)); + exportToDispatchSites[exportOp].insert(std::make_pair( + aliasedExecAffinity, rewriter.getArrayAttr(operandAffinityAttrs))); + }); + } + + LLVM_DEBUG({ + llvm::dbgs() << "Dump of exportToDispatchSites\n"; + for (auto [exportOp, affinities] : exportToDispatchSites) { + llvm::dbgs() << " ExportOp: " << exportOp.getSymName() << "\n"; + for (auto [execAffinity, resourceAffinities] : affinities) { + llvm::dbgs() << " executaion affinity: " << execAffinity << "\n"; + llvm::dbgs() << " resource affinities: " << resourceAffinities + << "\n"; + } + } + }); + + // ------------------------------------------------------------------------ + // Duplicate executables for each unique set of resource affinities. + // ------------------------------------------------------------------------ + + // Mapping from [execution affinity, resource operands affinities, export] + // to the executable op.
+ using DispatchSiteInfo = std::tuple; + DenseMap + dispatchSiteToExecutableOp; + for (auto [exportOp, execAndResourceAffinities] : exportToDispatchSites) { + auto executableOp = exportOp->getParentOfType(); + // No need to duplicate the executable if all the uses have the same + // affinities. + // TODO(hanchung): Do not duplicate the executables if bindings are not + // encoded. I.e., all the tensor types do not have encodings. + if (execAndResourceAffinities.size() == 1) { + auto [execAffinity, resourceAffinities] = execAndResourceAffinities[0]; + dispatchSiteToExecutableOp[DispatchSiteInfo( + execAffinity, resourceAffinities, exportOp)] = executableOp; + continue; + } + + int64_t dupId = -1; + for (auto [execAffinity, resourceAffinities] : execAndResourceAffinities) { + rewriter.setInsertionPointAfter(executableOp); + IREE::Stream::ExecutableOp dupOp = executableOp; + if (dupId != -1) { + auto symName = std::string(executableOp.getSymName()); + symName += "_dup" + std::to_string(dupId); + dupOp = rewriter.cloneWithoutRegions(executableOp); + rewriter.modifyOpInPlace(dupOp, [&] { + dupOp.setSymName(symName); + IRMapping mapping; + executableOp.getRegion().cloneInto(&dupOp.getRegion(), mapping); + }); + } + dispatchSiteToExecutableOp[DispatchSiteInfo( + execAffinity, resourceAffinities, exportOp)] = dupOp; + dupId++; + } + } + + // ------------------------------------------------------------------------ + // Update dispatch sites, i.e., point dispatch entry points to corresponding + // cloned executables. 
+ // ------------------------------------------------------------------------ + + for (auto dispatchOp : candidates) { + SmallVector newEntryPoints; + IREE::Stream::AffinityAttr aliasedExecAffinity = + getAliasedExecutionAffinity(dispatchOp, affinityAnalysis, aliasingSet); + SmallVector operandAttrs = resourceAffinities[dispatchOp]; + dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) { + auto exportOp = cast( + symbolTable.lookupSymbolIn(moduleOp, entryPoint)); + auto info = DispatchSiteInfo( + aliasedExecAffinity, rewriter.getArrayAttr(operandAttrs), exportOp); + assert(dispatchSiteToExecutableOp.count(info)); + + auto executableOp = dispatchSiteToExecutableOp[info]; + auto newSym = SymbolRefAttr::get(executableOp->getAttrOfType( + SymbolTable::getSymbolAttrName()), + entryPoint.getNestedReferences()); + newEntryPoints.push_back(newSym); + }); + + rewriter.modifyOpInPlace(dispatchOp, [&] { + dispatchOp.setEntryPointsAttr(rewriter.getArrayAttr(newEntryPoints)); + }); + } + + // TODO(hanchung): Update encodings in executables. + + return success(); +} + // TODO(hanchung): Add "cloneWithEncoding" method to RankedTensorType. static RankedTensorType cloneWithEncoding(RankedTensorType type, Attribute encodingAttr) { @@ -156,6 +437,7 @@ struct SpecializeEncodingsPass return signalPassFailure(); } + SymbolTable symbolTable(moduleOp); llvm::MapVector executableOps; for (auto executableOp : moduleOp.getOps()) { executableOps[executableOp.getName()] = executableOp; @@ -171,7 +453,11 @@ struct SpecializeEncodingsPass return signalPassFailure(); } - // TODO(hanchung): Duplicate executables and update dispatch ops. 
+ if (failed(duplicateExecutablesPerAffinityVariant( + moduleOp, symbolTable, funcOp, resolveLayoutAttr))) { + funcOp.emitError("failed on executable duplication"); + return signalPassFailure(); + } } } }; diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir index 5fab86a79e79..3fdc6aa006a1 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir @@ -33,3 +33,94 @@ module { // CHECK: %[[D0_RES:.+]] = stream.tensor.sizeof {{.+}} tensor // CHECK: %[[D1_RES:.+]] = stream.tensor.sizeof {{.+}} tensor // CHECK: return %[[D0_RES]], %[[D1_RES]] + +// ----- + +#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}> +#map = affine_map<(d0) -> (d0)> +#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device +#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device +module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} { + util.global private @device_a = #device_target_local_0_ + util.global private @device_b = #device_target_local_1_ + stream.executable private @ex { + stream.executable.export public @dispatch + } + util.func public @multi_device_with_same_executable_targets(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view { + %c16 = arith.constant 16 : index + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major) + %0 = 
stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource{%c16} + %1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint + %2 = stream.timepoint.await %1 => %0 : !stream.resource{%c16} + %3 = stream.async.transfer %2 : !stream.resource{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16} + %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} + %5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16} + %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} + %7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16} + %result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint + stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence) + %8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource{%c16} + %9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource{%c16} -> !hal.buffer_view + util.return %9 : !hal.buffer_view + } +} + +// CHECK: #[[DEVICE_LOCAL_0:.+]] = #hal.device.target +// CHECK: #[[DEVICE_LOCAL_1:.+]] = #hal.device.target +// CHECK: util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]] +// CHECK: util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]] +// CHECK: stream.executable private 
@[[$EX0:.+]] { +// CHECK-NOT: stream.executable private +// CHECK-LABEL: util.func public @multi_device_with_same_executable_targets +// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch +// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) @[[$EX0]]::@dispatch + +// ----- + +#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}> +#executable_target_x86_64 = #hal.executable.target<"llvm-cpu", "xyz", {encoding = #iree_cpu.cpu_encoding_layout<>, target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> +#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device +#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_x86_64]> : !hal.device +#map = affine_map<(d0) -> (d0)> +module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} { + util.global private @device_a = #device_target_local_0_ + util.global private @device_b = #device_target_local_1_ + stream.executable private @ex { + stream.executable.export public @dispatch + } + util.func public @multi_device_with_different_executable_targets(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view { + %c16 = arith.constant 16 : index + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %element_type_f32 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major) + %0 = stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource{%c16} + %1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint + %2 = stream.timepoint.await %1 => %0 : !stream.resource{%c16} + %3 = stream.async.transfer %2 
: !stream.resource{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16} + %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} + %5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16} + %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} + %7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16} + %result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint + stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence) + %8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource{%c16} + %9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource{%c16} -> !hal.buffer_view + util.return %9 : !hal.buffer_view + } +} + +// CHECK: #[[DEVICE_LOCAL_0:.+]] = #hal.device.target +// CHECK: #[[DEVICE_LOCAL_1:.+]] = #hal.device.target +// CHECK: util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]] +// CHECK: util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]] +// CHECK: stream.executable private @[[$EX0:.+]] { +// CHECK: stream.executable private @[[$EX1:.+]] { +// CHECK-LABEL: util.func public @multi_device_with_different_executable_targets +// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch +// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) 
@[[$EX1]]::@dispatch