diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.cpp b/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.cpp
index dd1002a8f931..387fc78c5d41 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.cpp
@@ -7,6 +7,7 @@
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "iree/compiler/Utils/StringUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
@@ -893,6 +894,30 @@ bool DeviceAffinityAttr::isExecutableWith(
   return false;
 }
 
+// Returns true if |other| is null, or if both this affinity's device and
+// |other|'s device resolve to device targets with equal executable targets.
+bool DeviceAffinityAttr::isTranslatableWith(
+    ModuleOp moduleOp, IREE::Stream::AffinityAttr other) const {
+  if (!other)
+    return true;
+  auto otherAffinityAttr = llvm::dyn_cast_if_present<DeviceAffinityAttr>(other);
+  if (!otherAffinityAttr)
+    return false;
+  auto lookupDeviceTarget = [&](SymbolRefAttr symbol) {
+    auto globalOp = llvm::dyn_cast_if_present<IREE::Util::GlobalOp>(
+        SymbolTable::lookupSymbolIn(moduleOp, symbol));
+    if (!globalOp)
+      return IREE::HAL::DeviceTargetAttr();
+    return llvm::dyn_cast_if_present<IREE::HAL::DeviceTargetAttr>(
+        globalOp.getInitialValueAttr());
+  };
+  auto device = lookupDeviceTarget(getDevice());
+  auto otherDevice = lookupDeviceTarget(otherAffinityAttr.getDevice());
+  if (!device || !otherDevice)
+    return false;
+  return device.getExecutableTargets() == otherDevice.getExecutableTargets();
+}
+
 IREE::Stream::AffinityAttr
 DeviceAffinityAttr::joinOR(IREE::Stream::AffinityAttr other) const {
   if (!other)
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.td
index c0dbc59399ad..01d1fa2340f5 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALAttrs.td
@@ -853,6 +853,7 @@ def HAL_DeviceSelectAttr : AttrDef<HAL_Dialect, "DeviceSelect", [
 def HAL_DeviceAffinityAttr : AttrDef<HAL_Dialect, "DeviceAffinity", [
   DeclareAttrInterfaceMethods<Stream_AffinityAttr, [
     "isExecutableWith",
+    "isTranslatableWith",
     "joinOR",
     "joinAND",
   ]>,
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamInterfaces.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamInterfaces.td
index 2b686b7478b6..79b292412527 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamInterfaces.td
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamInterfaces.td
@@ -59,6 +59,20 @@ def Stream_AffinityAttr : AttrInterface<"AffinityAttr"> {
         return IREE::Stream::AffinityAttr::areCompatible($_attr, other);
       }]
     >,
+    InterfaceMethod<
+      /*desc=*/[{
+        Returns true if this affinity and |other| can share the same execution
+        configuration (e.g., translation artifacts).
+      }],
+      /*retTy=*/"bool",
+      /*methodName=*/"isTranslatableWith",
+      /*args=*/(ins "ModuleOp":$moduleOp,
+                    "IREE::Stream::AffinityAttr":$other),
+      /*methodBody=*/"",
+      /*defaultImplementation=*/[{
+        return IREE::Stream::AffinityAttr::areTranslationCompatible(moduleOp, $_attr, other);
+      }]
+    >,
     InterfaceMethod<
       /*desc=*/[{
         Returns an affinity describing the union with |other| constraints.
@@ -118,6 +132,11 @@ def Stream_AffinityAttr : AttrInterface<"AffinityAttr"> {
     // Returns true if |lhs| and |rhs| indicate that their operations can
     // execute together on the same execution queue.
     static bool canExecuteTogether(AffinityAttr lhs, AffinityAttr rhs);
+
+    // Returns true if |lhs| and |rhs| are translation compatible. E.g., they
+    // are compatible if they have the same executable targets.
+    static bool areTranslationCompatible(ModuleOp moduleOp, AffinityAttr lhs,
+                                         AffinityAttr rhs);
   }];
 }
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.cpp
index 82b8609e90e5..93dadaddf64d 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamTypes.cpp
@@ -393,6 +393,16 @@ bool AffinityAttr::canExecuteTogether(AffinityAttr lhs, AffinityAttr rhs) {
   return lhs.isExecutableWith(rhs);
 }
 
+// static
+bool AffinityAttr::areTranslationCompatible(ModuleOp moduleOp, AffinityAttr lhs,
+                                            AffinityAttr rhs) {
+  // Equal affinities (including two null ones) are trivially compatible, and
+  // a missing affinity on either side places no constraint on the other.
+  if (lhs == rhs || !lhs || !rhs)
+    return true;
+  return lhs.isTranslatableWith(moduleOp, rhs);
+}
+
 //===----------------------------------------------------------------------===//
 // #stream.partitioning_config
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp
index 40571f341ab9..e6675c4fd70a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp
@@ -11,11 +11,13 @@
 #include "iree/compiler/Dialect/Stream/IR/StreamTraits.h"
 #include "iree/compiler/Dialect/Stream/IR/StreamTypes.h"
 #include "iree/compiler/Dialect/Stream/Transforms/Passes.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/LogicalResult.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/SymbolTable.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
@@ -52,6 +54,285 @@ SmallVector<const T *> gatherUsedDialectInterfaces(mlir::ModuleOp moduleOp) {
   return results;
 }
 
+/// Disjoint-set data structure holding non-overlapping sets of aliasing
+/// attributes.
+class AffinityAliasingSet {
+public:
+  // Merges the sets containing |aliasee| and |aliaser| into a single set.
+  void addAlias(Attribute aliasee, Attribute aliaser) {
+    auto aliaseeWithId = getWithId(aliasee);
+    auto aliaserWithId = getWithId(aliaser);
+    attrAliasing.unionSets(aliaseeWithId, aliaserWithId);
+  }
+
+  // Returns the leader of the set that holds |attr|.
+  IREE::Stream::AffinityAttr findLeader(Attribute attr) {
+    return cast<IREE::Stream::AffinityAttr>(
+        attrAliasing.findLeader(getWithId(attr))->attr);
+  }
+
+private:
+  // EquivalenceClasses needs an ordering over its elements to return
+  // deterministic results, so every attribute is tagged with a first-seen id.
+  struct NumberedAttribute {
+    Attribute attr;
+    int64_t id;
+    static Attribute getAttribute(const NumberedAttribute &attr) {
+      return attr.attr;
+    }
+  };
+
+  struct Comparator {
+    bool operator()(const NumberedAttribute &a,
+                    const NumberedAttribute &b) const {
+      return a.id < b.id;
+    }
+  };
+
+  NumberedAttribute getWithId(Attribute attr) const {
+    return {attr, id.try_emplace(attr, id.size()).first->second};
+  }
+
+  mutable llvm::DenseMap<Attribute, int64_t> id;
+  llvm::EquivalenceClasses<NumberedAttribute, Comparator> attrAliasing;
+};
+
+/// Returns, for each resource operand of `dispatchOp` whose type implements
+/// AffinityTypeInterface, an array attribute with the aliasing-set leaders of
+/// the operand's affinities. Operands of other types are skipped, so the
+/// result can be shorter than the operand list. An operand whose affinity
+/// lookup fails contributes an empty array attribute.
+static SmallVector<Attribute> getAliasedResourceOperandsAffinities(
+    IREE::Stream::AffinityAnalysis &affinityAnalysis,
+    IREE::Stream::AsyncDispatchOp dispatchOp,
+    AffinityAliasingSet &aliasingSet) {
+  Builder builder(dispatchOp.getContext());
+  SmallVector<Attribute> perOperandLeaders;
+  for (auto resource : dispatchOp.getResourceOperands()) {
+    // Only operands with an affinity-carrying type are considered.
+    if (!isa<IREE::Stream::AffinityTypeInterface>(resource.getType())) {
+      continue;
+    }
+
+    // `leaders` is only filled after a successful lookup, so a failed lookup
+    // is recorded as an empty array attribute.
+    SmallVector<Attribute> leaders;
+    SmallVector<IREE::Stream::AffinityAttr> found;
+    if (affinityAnalysis.tryLookupResourceAffinity(resource, found)) {
+      for (IREE::Stream::AffinityAttr affinity : found) {
+        leaders.push_back(aliasingSet.findLeader(affinity));
+      }
+    }
+    perOperandLeaders.push_back(builder.getArrayAttr(leaders));
+  }
+  return perOperandLeaders;
+}
+
+/// Returns the aliasing-set leader of the execution affinity of `dispatchOp`.
+/// Lookup success and a single resulting affinity are checked by asserts only.
+static IREE::Stream::AffinityAttr
+getAliasedExecutionAffinity(IREE::Stream::AsyncDispatchOp dispatchOp,
+                            IREE::Stream::AffinityAnalysis &affinityAnalysis,
+                            AffinityAliasingSet &aliasingSet) {
+  SmallVector<IREE::Stream::AffinityAttr> execAffinities;
+  [[maybe_unused]] bool succeed =
+      affinityAnalysis.tryLookupExecutionAffinity(dispatchOp, execAffinities);
+  assert(succeed && "failed on execution affinity lookup");
+  assert(execAffinities.size() == 1 && "We should only have a single execution "
+                                       "affinity when running the pass.");
+  return aliasingSet.findLeader(execAffinities[0]);
+}
+
+/// Gathers the execution affinities of every op in `ops` plus the affinities
+/// of their affinity-typed resource operands. Fails only when an execution
+/// affinity lookup fails; operands whose lookup fails are ignored.
+static FailureOr<SetVector<IREE::Stream::AffinityAttr>>
+collectAllTheAffinities(IREE::Stream::AffinityAnalysis &affinityAnalysis,
+                        ArrayRef<IREE::Stream::AsyncDispatchOp> ops) {
+  SetVector<IREE::Stream::AffinityAttr> collected;
+  for (IREE::Stream::AsyncDispatchOp op : ops) {
+    SmallVector<IREE::Stream::AffinityAttr> execution;
+    const bool resolved =
+        affinityAnalysis.tryLookupExecutionAffinity(op, execution);
+    if (!resolved) {
+      return op.emitError("failed on execution affinity lookup");
+    }
+    collected.insert(execution.begin(), execution.end());
+    for (auto resource : op.getResourceOperands()) {
+      // Operands without an affinity-carrying type contribute nothing.
+      const bool hasAffinityType =
+          isa<IREE::Stream::AffinityTypeInterface>(resource.getType());
+      SmallVector<IREE::Stream::AffinityAttr> perResource;
+      if (hasAffinityType &&
+          affinityAnalysis.tryLookupResourceAffinity(resource, perResource)) {
+        collected.insert(perResource.begin(), perResource.end());
+      }
+    }
+  }
+  return collected;
+}
+
+/// Duplicates stream.executables based on the affinity analysis of the
+/// stream.async.dispatch ops in `funcOp`. An executable can be launched by
+/// different devices, which can produce wrong codegen artifacts when binding
+/// types are encoded (i.e., the tensor type has an encoding attribute),
+/// because devices may resolve the same encoding to different layouts.
+/// E.g., say that device_a and device_b interpret a tensor type with
+/// encodings in different layouts, and there is an executable that can be
+/// launched with resources from either device_a or device_b. The input
+/// layout of the executable is then ambiguous. In this case the executable
+/// is cloned once per distinct (execution affinity, operand affinities)
+/// combination and each dispatch is pointed at its matching clone.
+/// `resolveLayoutAttr` is not used yet; encodings are not updated here.
+static LogicalResult duplicateExecutablesPerAffinityVariant(
+    ModuleOp moduleOp, SymbolTable &symbolTable, FunctionOpInterface funcOp,
+    IREE::Stream::ResolveLayoutAttrFn resolveLayoutAttr) {
+  MLIRContext *ctx = moduleOp.getContext();
+  IRRewriter rewriter(ctx);
+
+  // ------------------------------------------------------------------------
+  // Gather per-export [execution affinity -> [resource affinities]] map.
+  // ------------------------------------------------------------------------
+
+  IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp);
+  if (failed(affinityAnalysis.run())) {
+    return moduleOp.emitError("failed on running affinity analysis");
+  }
+  SmallVector<IREE::Stream::AsyncDispatchOp> candidates;
+  funcOp.walk(
+      [&](IREE::Stream::AsyncDispatchOp op) { candidates.push_back(op); });
+
+  // export -> [affinity -> array per resource of affinities PVS].
+  DenseMap<IREE::Stream::ExecutableExportOp,
+           SetVector<std::pair<IREE::Stream::AffinityAttr, ArrayAttr>>>
+      exportToDispatchSites;
+
+  // Build equivalence classes for each affinity, giving us nice clustered
+  // sets, because some affinities could share the same executable targets.
+  AffinityAliasingSet aliasingSet;
+  {
+    FailureOr<SetVector<IREE::Stream::AffinityAttr>> maybeAttrs =
+        collectAllTheAffinities(affinityAnalysis, candidates);
+    if (failed(maybeAttrs)) {
+      return funcOp.emitError("failed to collect all the affinities");
+    }
+    for (auto affinity0 : *maybeAttrs) {
+      for (auto affinity1 : *maybeAttrs) {
+        if (IREE::Stream::AffinityAttr::areTranslationCompatible(
+                moduleOp, affinity0, affinity1)) {
+          aliasingSet.addAlias(affinity0, affinity1);
+        }
+      }
+    }
+  }
+
+  llvm::MapVector<IREE::Stream::AsyncDispatchOp, SmallVector<Attribute>>
+      resourceAffinities;
+  for (auto dispatchOp : candidates) {
+    IREE::Stream::AffinityAttr aliasedExecAffinity =
+        getAliasedExecutionAffinity(dispatchOp, affinityAnalysis, aliasingSet);
+    SmallVector<Attribute> operandAffinityAttrs =
+        getAliasedResourceOperandsAffinities(affinityAnalysis, dispatchOp,
+                                             aliasingSet);
+    resourceAffinities[dispatchOp] = operandAffinityAttrs;
+
+    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
+      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
+          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
+      exportToDispatchSites[exportOp].insert(std::make_pair(
+          aliasedExecAffinity, rewriter.getArrayAttr(operandAffinityAttrs)));
+    });
+  }
+
+  LLVM_DEBUG({
+    llvm::dbgs() << "Dump of exportToDispatchSites\n";
+    for (auto &[exportOp, affinities] : exportToDispatchSites) {
+      llvm::dbgs() << "  ExportOp: " << exportOp.getSymName() << "\n";
+      for (auto [execAffinity, operandAffinities] : affinities) {
+        llvm::dbgs() << "    executaion affinity: " << execAffinity << "\n";
+        llvm::dbgs() << "    resource affinities: " << operandAffinities
+                     << "\n";
+      }
+    }
+  });
+
+  // ------------------------------------------------------------------------
+  // Duplicate executables for each unique set of resource affinities.
+  // ------------------------------------------------------------------------
+
+  // Mapping from [execution affinity, resource operands affinities, export]
+  // to the executable op.
+  using DispatchSiteInfo = std::tuple<IREE::Stream::AffinityAttr, ArrayAttr,
+                                      IREE::Stream::ExecutableExportOp>;
+  DenseMap<DispatchSiteInfo, IREE::Stream::ExecutableOp>
+      dispatchSiteToExecutableOp;
+  for (auto &[exportOp, execAndResourceAffinities] : exportToDispatchSites) {
+    auto executableOp = exportOp->getParentOfType<IREE::Stream::ExecutableOp>();
+    // No need to duplicate the executable if all the uses have the same
+    // affinities.
+    // TODO(hanchung): Do not duplicate the executables if bindings are not
+    // encoded. I.e., all the tensor types do not have encodings.
+    if (execAndResourceAffinities.size() == 1) {
+      auto [execAffinity, operandAffinities] = execAndResourceAffinities[0];
+      dispatchSiteToExecutableOp[DispatchSiteInfo(
+          execAffinity, operandAffinities, exportOp)] = executableOp;
+      continue;
+    }
+
+    int64_t dupId = -1;
+    for (auto [execAffinity, operandAffinities] : execAndResourceAffinities) {
+      rewriter.setInsertionPointAfter(executableOp);
+      IREE::Stream::ExecutableOp dupOp = executableOp;
+      if (dupId != -1) {
+        auto symName = std::string(executableOp.getSymName());
+        symName += "_dup" + std::to_string(dupId);
+        dupOp = rewriter.cloneWithoutRegions(executableOp);
+        rewriter.modifyOpInPlace(dupOp, [&] {
+          dupOp.setSymName(symName);
+          IRMapping mapping;
+          executableOp.getRegion().cloneInto(&dupOp.getRegion(), mapping);
+        });
+      }
+      dispatchSiteToExecutableOp[DispatchSiteInfo(
+          execAffinity, operandAffinities, exportOp)] = dupOp;
+      dupId++;
+    }
+  }
+
+  // ------------------------------------------------------------------------
+  // Update dispatch sites, i.e., point dispatch entry points to corresponding
+  // cloned executables.
+  // ------------------------------------------------------------------------
+
+  for (auto dispatchOp : candidates) {
+    SmallVector<Attribute> newEntryPoints;
+    IREE::Stream::AffinityAttr aliasedExecAffinity =
+        getAliasedExecutionAffinity(dispatchOp, affinityAnalysis, aliasingSet);
+    SmallVector<Attribute> operandAttrs = resourceAffinities[dispatchOp];
+    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
+      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
+          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
+      auto info = DispatchSiteInfo(
+          aliasedExecAffinity, rewriter.getArrayAttr(operandAttrs), exportOp);
+      assert(dispatchSiteToExecutableOp.count(info));
+
+      auto executableOp = dispatchSiteToExecutableOp[info];
+      auto newSym = SymbolRefAttr::get(executableOp->getAttrOfType<StringAttr>(
+                                           SymbolTable::getSymbolAttrName()),
+                                       entryPoint.getNestedReferences());
+      newEntryPoints.push_back(newSym);
+    });
+
+    rewriter.modifyOpInPlace(dispatchOp, [&] {
+      dispatchOp.setEntryPointsAttr(rewriter.getArrayAttr(newEntryPoints));
+    });
+  }
+
+  // TODO(hanchung): Update encodings in executables.
+
+  return success();
+}
+
 // TODO(hanchung): Add "cloneWithEncoding" method to RankedTensorType.
 static RankedTensorType cloneWithEncoding(RankedTensorType type,
                                           Attribute encodingAttr) {
@@ -156,6 +437,7 @@ struct SpecializeEncodingsPass
       return signalPassFailure();
     }
 
+    SymbolTable symbolTable(moduleOp);
     llvm::MapVector<StringRef, IREE::Stream::ExecutableOp> executableOps;
     for (auto executableOp : moduleOp.getOps<IREE::Stream::ExecutableOp>()) {
       executableOps[executableOp.getName()] = executableOp;
@@ -171,7 +453,11 @@ struct SpecializeEncodingsPass
         return signalPassFailure();
       }
 
-      // TODO(hanchung): Duplicate executables and update dispatch ops.
+      if (failed(duplicateExecutablesPerAffinityVariant(
+              moduleOp, symbolTable, funcOp, resolveLayoutAttr))) {
+        funcOp.emitError("failed on executable duplication");
+        return signalPassFailure();
+      }
     }
   }
 };
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir
index 5fab86a79e79..3fdc6aa006a1 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir
@@ -33,3 +33,94 @@ module {
 // CHECK:         %[[D0_RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING0]]>
 // CHECK:         %[[D1_RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING1]]>
 // CHECK:         return %[[D0_RES]], %[[D1_RES]]
+
+// -----
+
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
+#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} {
+  util.global private @device_a = #device_target_local_0_
+  util.global private @device_b = #device_target_local_1_
+  stream.executable private @ex {
+    stream.executable.export public @dispatch
+  }
+  util.func public @multi_device_with_same_executable_targets(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view {
+    %c16 = arith.constant 16 : index
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
+    %1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint
+    %2 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c16}
+    %3 = stream.async.transfer %2 : !stream.resource<external>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
+    %5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16}
+    %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
+    %7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
+    %result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint
+    stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence)
+    %8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<external>{%c16}
+    %9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
+    util.return %9 : !hal.buffer_view
+  }
+}
+
+// CHECK:       #[[DEVICE_LOCAL_0:.+]] = #hal.device.target
+// CHECK:       #[[DEVICE_LOCAL_1:.+]] = #hal.device.target
+// CHECK:       util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]]
+// CHECK:       util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]]
+// CHECK:       stream.executable private @[[$EX0:.+]] {
+// CHECK-NOT:   stream.executable private
+// CHECK-LABEL: util.func public @multi_device_with_same_executable_targets
+// CHECK:         stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch
+// CHECK:         stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) @[[$EX0]]::@dispatch
+
+// -----
+
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
+#executable_target_x86_64 = #hal.executable.target<"llvm-cpu", "xyz", {encoding = #iree_cpu.cpu_encoding_layout<>, target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}>
+#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
+#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_x86_64]> : !hal.device
+#map = affine_map<(d0) -> (d0)>
+module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} {
+  util.global private @device_a = #device_target_local_0_
+  util.global private @device_b = #device_target_local_1_
+  stream.executable private @ex {
+    stream.executable.export public @dispatch
+  }
+  util.func public @multi_device_with_different_executable_targets(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view {
+    %c16 = arith.constant 16 : index
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
+    %1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint
+    %2 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c16}
+    %3 = stream.async.transfer %2 : !stream.resource<external>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
+    %5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16}
+    %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
+    %7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
+    %result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint
+    stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence)
+    %8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<external>{%c16}
+    %9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
+    util.return %9 : !hal.buffer_view
+  }
+}
+
+// CHECK:       #[[DEVICE_LOCAL_0:.+]] = #hal.device.target
+// CHECK:       #[[DEVICE_LOCAL_1:.+]] = #hal.device.target
+// CHECK:       util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]]
+// CHECK:       util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]]
+// CHECK:       stream.executable private @[[$EX0:.+]] {
+// CHECK:       stream.executable private @[[$EX1:.+]] {
+// CHECK-LABEL: util.func public @multi_device_with_different_executable_targets
+// CHECK:         stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch
+// CHECK:         stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) @[[$EX1]]::@dispatch