diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
index 02939d8eaf2c..d60e6b19c447 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
@@ -782,17 +782,11 @@ struct ConvertDispatchOp
       IREE::Flow::DispatchOp op, OneToNOpAdaptor adaptor,
       IREE::Stream::AffinityAttr executionAffinityAttr,
       ConversionPatternRewriter &rewriter) const override {
-    // Zero is going to be used for each operand to start.
-    auto zeroOffset = rewriter.create<arith::ConstantIndexOp>(op.getLoc(), 0);
-
     // Query and resolve all operands and their sizes.
-    SmallVector<Value> dispatchOperands;
-    SmallVector<Value> dispatchOperandSizes;
-    SmallVector<Value> dispatchOperandOffsets;
-    SmallVector<Value> dispatchOperandEnds;
-    SmallVector<Value> dispatchOperandLengths;
+    SmallVector<Value> operands;
     SmallVector<Value> operandSizes;
-
+    SmallVector<Value> allOperandSizes;
+    SmallVector<Type> operandEncodings;
     for (auto [oldOperand, convertedOperands] :
          llvm::zip_equal(op.getArguments(), adaptor.getArguments())) {
       Value newOperand;
@@ -801,34 +795,36 @@ struct ConvertDispatchOp
             transferTensorOperands(op.getLoc(), oldOperand, convertedOperands,
                                    executionAffinityAttr, rewriter);
         newOperand = newOperandCast.resource;
-        dispatchOperandSizes.push_back(newOperandCast.resourceSize);
         operandSizes.push_back(newOperandCast.resourceSize);
-        dispatchOperandOffsets.push_back(zeroOffset);
-        dispatchOperandEnds.push_back(newOperandCast.resourceSize);
-        dispatchOperandLengths.push_back(newOperandCast.resourceSize);
+        allOperandSizes.push_back(newOperandCast.resourceSize);
+        operandEncodings.push_back(oldOperand.getType());
       } else {
-        operandSizes.push_back({});
+        allOperandSizes.push_back({});
+        operandEncodings.push_back(rewriter.getType<IREE::Util::UnusedType>());
         newOperand = convertedOperands.front();
       }
-      dispatchOperands.push_back(newOperand);
+      operands.push_back(newOperand);
     }

     // Construct result sizes or reuse tied operand sizes from above.
     SmallVector<Value> resultSizes;
     SmallVector<Type> resultTypes;
+    SmallVector<Type> resultEncodings;
     auto unknownType = rewriter.getType<IREE::Stream::ResourceType>();
     auto tiedOperandBase = op.getTiedOperandsIndexAndLength().first;
     for (auto result : llvm::enumerate(op.getResults())) {
       auto oldResultType = result.value().getType();
       if (!llvm::isa<ShapedType>(oldResultType)) {
         resultTypes.push_back(getTypeConverter()->convertType(oldResultType));
+        resultEncodings.push_back(rewriter.getType<IREE::Util::UnusedType>());
         continue;
       }
       auto tiedOperand = op.getTiedResultOperandIndex(result.index());
       if (tiedOperand.has_value()) {
         auto operandIndex = tiedOperand.value() - tiedOperandBase;
-        resultSizes.push_back(operandSizes[operandIndex]);
-        resultTypes.push_back(dispatchOperands[operandIndex].getType());
+        resultSizes.push_back(allOperandSizes[operandIndex]);
+        resultTypes.push_back(operands[operandIndex].getType());
+        resultEncodings.push_back(operandEncodings[operandIndex]);
       } else {
         auto resultDynamicDims = IREE::Util::buildDynamicDimsForValue(
             op.getLoc(), result.value(), rewriter);
@@ -836,15 +832,21 @@ struct ConvertDispatchOp
             buildResultSizeOf(op.getLoc(), result.value(), resultDynamicDims,
                               executionAffinityAttr, rewriter));
         resultTypes.push_back(unknownType);
+        resultEncodings.push_back(oldResultType);
       }
     }

-    auto newOp = rewriter.create<IREE::Stream::AsyncDispatchOp>(
+    auto newOp = rewriter.create<IREE::Stream::TensorDispatchOp>(
         op.getLoc(), resultTypes, flattenValues(adaptor.getWorkload()),
-        adaptor.getEntryPointsAttr(), dispatchOperands, dispatchOperandSizes,
-        dispatchOperandOffsets, dispatchOperandEnds, dispatchOperandLengths,
-        resultSizes, adaptor.getTiedOperandsAttr(), executionAffinityAttr);
-    newOp->setDialectAttrs(op->getDialectAttrs());
+        adaptor.getEntryPointsAttr(), operands, operandSizes,
+        rewriter.getTypeArrayAttr(operandEncodings), op.getArgumentDims(),
+        resultSizes, rewriter.getTypeArrayAttr(resultEncodings),
+        op.getResultDims(), adaptor.getTiedOperandsAttr(),
+        executionAffinityAttr);
+    newOp->setDialectAttrs(
+        llvm::make_filter_range(op->getDialectAttrs(), [](NamedAttribute attr) {
+          return attr.getName() != "stream.affinity";
+        }));
     SmallVector<SmallVector<Value>> replacementsVec = llvm::map_to_vector(
         llvm::zip_equal(newOp->getResults(), resultSizes), [](auto it) {
           return SmallVector<Value>{std::get<0>(it), std::get<1>(it)};
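For orientation, a minimal before/after sketch of the conversion above, assembled from the tests that follow (IR abbreviated; %input_size stands for the converted resource size and is not a name from the patch):

    // Before: flow-level dispatch carrying tensor shapes and dynamic dims.
    %0 = flow.dispatch @ex::@entry(%input) : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> tensor<?x?x1024xf32>{%dim1, %dim3}
    // After: encodings and dims ride on stream.tensor.dispatch directly,
    // replacing the zero offset/end/length lists async.dispatch needed.
    %result_size = stream.tensor.sizeof tensor<?x?x1024xf32>{%dim1, %dim3} : index
    %result = stream.tensor.dispatch @ex::@entry(%input) : (tensor<7x?x24x?xf32>{%dim1, %dim3} in !stream.resource<*>{%input_size}) -> tensor<?x?x1024xf32>{%dim1, %dim3} in !stream.resource<*>{%result_size}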
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/dispatch_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/dispatch_ops.mlir
index 063389fb4dfa..bd9bbc848db1 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/dispatch_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/dispatch_ops.mlir
@@ -3,11 +3,11 @@
 // CHECK-LABEL: @dispatchNoWorkload
 // CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM1:.+]]: index, %[[DIM3:.+]]: index)
 util.func public @dispatchNoWorkload(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> tensor<?x?x1024xf32> {
-  // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]}
-  // CHECK: %[[RESULT:.+]] = stream.async.dispatch @ex::@entry(%[[INPUT]][%c0 to %[[INPUT_SIZE]] for %[[INPUT_SIZE]]]) :
-  // CHECK-SAME: (!stream.resource<*>{%[[INPUT_SIZE]]}) -> !stream.resource<*>{%[[RESULT_SIZE]]}
+  // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]}
+  // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch @ex::@entry(%[[INPUT]]) :
+  // CHECK-SAME: (tensor<7x?x24x?xf32>{%[[DIM1]], %[[DIM3]]} in !stream.resource<*>{%[[INPUT_SIZE]]}) -> tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]} in !stream.resource<*>{%[[RESULT_SIZE]]}
   %0 = flow.dispatch @ex::@entry(%input) : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> tensor<?x?x1024xf32>{%dim1, %dim3}
-  // return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
+  // CHECK: util.return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
   util.return %0 : tensor<?x?x1024xf32>
 }

@@ -15,16 +15,17 @@ util.func public @dispatchNoWorkload(%input: tensor<7x?x24x?xf32>, %dim1: index,
 // CHECK-LABEL: @dispatch
 // CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM1:.+]]: index, %[[DIM3:.+]]: index)
-util.func public @dispatch(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> tensor<?x?x1024xf32> {
+util.func public @dispatch(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> (tensor<?x?x1024xf32>, tensor<1024x?x?xf32>) {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
-  // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]}
-  // CHECK: %[[RESULT:.+]] = stream.async.dispatch @ex::@entry[%c1, %c2, %c3](%[[INPUT]][%c0 to %[[INPUT_SIZE]] for %[[INPUT_SIZE]]]) :
-  // CHECK-SAME: (!stream.resource<*>{%[[INPUT_SIZE]]}) -> !stream.resource<*>{%[[RESULT_SIZE]]}
-  %0 = flow.dispatch @ex::@entry[%c1, %c2, %c3](%input) : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> tensor<?x?x1024xf32>{%dim1, %dim3}
-  // return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
-  util.return %0 : tensor<?x?x1024xf32>
+  // CHECK: %[[RESULT0_SIZE:.+]] = stream.tensor.sizeof tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]}
+  // CHECK: %[[RESULT1_SIZE:.+]] = stream.tensor.sizeof tensor<1024x?x?xf32>{%[[DIM3]], %[[DIM1]]}
+  // CHECK: %[[RESULTS:.+]]:2 = stream.tensor.dispatch @ex::@entry[%c1, %c2, %c3](%[[INPUT]]) :
+  // CHECK-SAME: (tensor<7x?x24x?xf32>{%[[DIM1]], %[[DIM3]]} in !stream.resource<*>{%[[INPUT_SIZE]]}) -> (tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]} in !stream.resource<*>{%[[RESULT0_SIZE]]}, tensor<1024x?x?xf32>{%[[DIM3]], %[[DIM1]]} in !stream.resource<*>{%[[RESULT1_SIZE]]})
+  %results:2 = flow.dispatch @ex::@entry[%c1, %c2, %c3](%input) : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> (tensor<?x?x1024xf32>{%dim1, %dim3}, tensor<1024x?x?xf32>{%dim3, %dim1})
+  // CHECK: util.return %[[RESULTS]]#0, %[[RESULT0_SIZE]], %[[RESULTS]]#1, %[[RESULT1_SIZE]] : !stream.resource<*>, index, !stream.resource<*>, index
+  util.return %results#0, %results#1 : tensor<?x?x1024xf32>, tensor<1024x?x?xf32>
 }

 // -----

@@ -36,9 +37,11 @@ util.func public @tiedDispatch(%input0: tensor<i32>, %input1: tensor<2x3xi32>) -
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
   // CHECK: %[[T_SIZE:.+]] = stream.tensor.sizeof tensor<3x9xi32> : index
-  // CHECK: %[[T:.+]] = stream.async.dispatch @ex::@entry0[%c1, %c2, %c3](%[[INPUT0]][%c0 to %[[INPUT0_SIZE]] for %[[INPUT0_SIZE]]]) : (!stream.resource<*>{%[[INPUT0_SIZE]]}) -> !stream.resource<*>{%[[T_SIZE]]}
+  // CHECK: %[[T:.+]] = stream.tensor.dispatch @ex::@entry0[%c1, %c2, %c3](%[[INPUT0]]) :
+  // CHECK-SAME: (tensor<i32> in !stream.resource<*>{%[[INPUT0_SIZE]]}) -> tensor<3x9xi32> in !stream.resource<*>{%[[T_SIZE]]}
   %0 = flow.dispatch @ex::@entry0[%c1, %c2, %c3](%input0) : (tensor<i32>) -> tensor<3x9xi32>
-  // CHECK: %[[RESULT:.+]] = stream.async.dispatch @ex::@entry1[%c1, %c2, %c3](%[[INPUT1]][%c0 to %[[INPUT1_SIZE]] for %[[INPUT1_SIZE]]], %[[T]][%c0 to %[[T_SIZE]] for %[[T_SIZE]]]) : (!stream.resource<*>{%[[INPUT1_SIZE]]}, !stream.resource<*>{%[[T_SIZE]]}) -> %[[T]]{%[[T_SIZE]]}
+  // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch @ex::@entry1[%c1, %c2, %c3](%[[INPUT1]], %[[T]]) :
+  // CHECK-SAME: (tensor<2x3xi32> in !stream.resource<*>{%[[INPUT1_SIZE]]}, tensor<3x9xi32> in !stream.resource<*>{%[[T_SIZE]]}) -> tensor<3x9xi32> in %[[T]]{%[[T_SIZE]]}
   %1 = flow.dispatch @ex::@entry1[%c1, %c2, %c3](%input1, %0) : (tensor<2x3xi32>, tensor<3x9xi32>) -> %0
   // CHECK: util.return %[[RESULT]], %[[T_SIZE]] : !stream.resource<*>, index
   util.return %1 : tensor<3x9xi32>

@@ -52,18 +55,20 @@ util.global private @device_b : !hal.device
 // CHECK-LABEL: @dispatchAffinity
 // CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM1:.+]]: index, %[[DIM3:.+]]: index)
 util.func public @dispatchAffinity(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> (tensor<?x?x1024xf32>, tensor<?x?x1024xf32>) {
-  // CHECK: %[[INPUT_A:.+]] = stream.async.transfer %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%[[INPUT_SIZE]]}
-  // CHECK: %[[RESULT0_SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device_a>) tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]}
-  // CHECK: %[[RESULT0:.+]] = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@entry0(%[[INPUT_A]][%c0 to %[[INPUT_SIZE]] for %[[INPUT_SIZE]]])
+  // CHECK: %[[INPUT_A:.+]] = stream.async.transfer %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%[[INPUT_SIZE]]}
+  // CHECK: %[[RESULT0_SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device_a>) tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]}
+  // CHECK: %[[RESULT0:.+]] = stream.tensor.dispatch on(#hal.device.affinity<@device_a>) @ex::@entry0(%[[INPUT_A]])
+  // CHECK-SAME: (tensor<7x?x24x?xf32>{%[[DIM1]], %[[DIM3]]} in !stream.resource<*>{%[[INPUT_SIZE]]}) -> tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]} in !stream.resource<*>{%[[RESULT0_SIZE]]}
   %0 = flow.dispatch @ex::@entry0(%input) {
     stream.affinity = #hal.device.affinity<@device_a>
   } : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> tensor<?x?x1024xf32>{%dim1, %dim3}
-  // CHECK: %[[INPUT_B:.+]] = stream.async.transfer %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%[[INPUT_SIZE]]}
-  // CHECK: %[[RESULT1_SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device_b>) tensor<?x?x1024xf32>{%[[DIM3]], %[[DIM1]]}
-  // CHECK: %[[RESULT1:.+]] = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@entry1(%[[INPUT_B]][%c0 to %[[INPUT_SIZE]] for %[[INPUT_SIZE]]])
+  // CHECK: %[[INPUT_B:.+]] = stream.async.transfer %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%[[INPUT_SIZE]]}
+  // CHECK: %[[RESULT1_SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device_b>) tensor<?x?x1024xf32>{%[[DIM3]], %[[DIM1]]}
+  // CHECK: %[[RESULT1:.+]] = stream.tensor.dispatch on(#hal.device.affinity<@device_b>) @ex::@entry1(%[[INPUT_B]])
+  // CHECK-SAME: (tensor<7x?x24x?xf32>{%[[DIM1]], %[[DIM3]]} in !stream.resource<*>{%[[INPUT_SIZE]]}) -> tensor<?x?x1024xf32>{%[[DIM3]], %[[DIM1]]} in !stream.resource<*>{%[[RESULT1_SIZE]]}
   %1 = flow.dispatch @ex::@entry1(%input) {
     stream.affinity = #hal.device.affinity<@device_b>
   } : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> tensor<?x?x1024xf32>{%dim3, %dim1}
-  // return %[[RESULT0]], %[[RESULT0_SIZE]], %[[RESULT1]], %[[RESULT1_SIZE]]
+  // CHECK: return %[[RESULT0]], %[[RESULT0_SIZE]], %[[RESULT1]], %[[RESULT1_SIZE]]
   util.return %0, %1 : tensor<?x?x1024xf32>, tensor<?x?x1024xf32>
 }
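One behavior visible in the dispatchAffinity test above: the stream.affinity dialect attribute is consumed into the on(...) clause rather than copied onto the new op, which is what the make_filter_range call in Patterns.cpp implements. A rough sketch (operands and trailing types elided):

    %0 = flow.dispatch @ex::@entry0(%input) { stream.affinity = #hal.device.affinity<@device_a> } : ...
    // becomes, with the attribute folded into the affinity clause and dropped:
    %0 = stream.tensor.dispatch on(#hal.device.affinity<@device_a>) @ex::@entry0(%input) : ...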
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
index df9e5480ef90..4f61917ed439 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
@@ -141,16 +141,12 @@ util.global private @device : !hal.device
 // CHECK-LABEL: @tensorBarrierDispatch
 // CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
 util.func public @tensorBarrierDispatch(%input: tensor<?xf32>, %dim0: index) -> tensor<?xf32> {
-  %c0 = arith.constant 0 : index
-  %barrier = flow.tensor.barrier %input : tensor<?xf32>{%dim0} on #hal.device.affinity<@device>
-  %0 = flow.dispatch @ex::@entry[%c0](%barrier) : (tensor<?xf32>{%dim0}) -> tensor<?xf32>{%dim0}
-
-  // CHECK: %[[C0:.+]] = arith.constant 0 : index
   // CHECK: %[[BARRIER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[DIM0]]} -> !stream.resource<*>
-  // CHECK: %[[C0_2:.+]] = arith.constant 0 : index
+  %barrier = flow.tensor.barrier %input : tensor<?xf32>{%dim0} on #hal.device.affinity<@device>
   // CHECK: %[[SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device>) tensor<?xf32>{%arg2} : index
-  // CHECK: %[[DISP:.+]] = stream.async.dispatch on(#hal.device.affinity<@device>) @ex::@entry[%[[C0]]](%[[BARRIER]][%[[C0_2]] to %[[DIM0]] for %[[DIM0]]])
-  // CHECK: util.return %[[DISP]], %[[SIZE]]
+  // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch on(#hal.device.affinity<@device>) @ex::@entry(%[[BARRIER]])
+  %0 = flow.dispatch @ex::@entry(%barrier) : (tensor<?xf32>{%dim0}) -> tensor<?xf32>{%dim0}
+  // CHECK: util.return %[[RESULT]], %[[SIZE]]
   util.return %0 : tensor<?xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
index d973db0c34ca..619df9a79ece 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
@@ -1363,6 +1363,36 @@ void TensorStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
   // TODO(benvanik): combine multiple stores to the same target if contiguous.
 }

+//===----------------------------------------------------------------------===//
+// stream.tensor.dispatch
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct DeduplicateTensorDispatchEntryRefs final
+    : public OpRewritePattern<TensorDispatchOp> {
+  using OpRewritePattern<TensorDispatchOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(TensorDispatchOp dispatchOp,
+                                PatternRewriter &rewriter) const override {
+    auto originalAttr = dispatchOp.getEntryPointsAttr();
+    auto newAttr = deduplicateArrayElements(originalAttr);
+    if (newAttr == originalAttr)
+      return failure();
+    rewriter.modifyOpInPlace(dispatchOp,
+                             [&]() { dispatchOp.setEntryPointsAttr(newAttr); });
+    return success();
+  }
+};
+
+} // namespace
+
+void TensorDispatchOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                                   MLIRContext *context) {
+  // TODO(benvanik): maybe tied type/lifetime updates?
+  results.insert<ElideUnusedOp<TensorDispatchOp>>(context);
+  results.insert<DeduplicateTensorDispatchEntryRefs>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // stream.async.alloca
 //===----------------------------------------------------------------------===//
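A sketch of the fold DeduplicateTensorDispatchEntryRefs performs, assuming the braced multi-entry-point syntax matches stream.async.dispatch (operand and type details elided):

    %0 = stream.tensor.dispatch {@ex::@entry, @ex::@entry}(%arg0) : ...
    // canonicalizes to:
    %0 = stream.tensor.dispatch @ex::@entry(%arg0) : ...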
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
index c3bf0cf3546c..4623a7bd6c64 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
@@ -82,7 +82,7 @@ static LogicalResult verifyOpDynamicDims(Operation *op, TypeRange types,
                                          ValueRange dynamicDims) {
   unsigned requiredCount = 0;
   for (auto type : types) {
-    if (auto shapedType = llvm::dyn_cast<ShapedType>(type)) {
+    if (auto shapedType = llvm::dyn_cast_if_present<ShapedType>(type)) {
       requiredCount += shapedType.getNumDynamicDims();
     }
   }
@@ -95,6 +95,28 @@ static LogicalResult verifyOpDynamicDims(Operation *op, TypeRange types,
   return success();
 }

+// Verifies that |dynamicDims| contains the appropriate number of dims for all
+// the dynamic dimensions in |type|.
+static LogicalResult verifyOpDynamicDimsRange(Operation *op,
+                                              ArrayAttr typesAttr,
+                                              ValueRange dynamicDims) {
+  unsigned requiredCount = 0;
+  for (auto attr : typesAttr) {
+    if (auto typeAttr = dyn_cast_if_present<TypeAttr>(attr)) {
+      if (auto shapedType = llvm::dyn_cast<ShapedType>(typeAttr.getValue())) {
+        requiredCount += shapedType.getNumDynamicDims();
+      }
+    }
+  }
+  if (dynamicDims.size() != requiredCount) {
+    return op->emitOpError()
+           << "type set has " << requiredCount
+           << " dynamic dimensions but only " << dynamicDims.size()
+           << " dimension values are attached";
+  }
+  return success();
+}
+
 // Verifies that |sizes| contains the appropriate number of sizes for all of the
 // sized types in |values|.
 static LogicalResult verifyOpValueSizes(Operation *op, ValueRange values,
@@ -367,6 +389,375 @@ static void printEncodedResourceOperands(OpAsmPrinter &p, Operation *op,
   p.printNewline();
 }

+//===----------------------------------------------------------------------===//
+// custom<EncodedShapedTypeList>
+//===----------------------------------------------------------------------===//
+// encoding{%dim0, %dim1} in type{%size0}, type, type{%size1}
+
+static ParseResult
+parseShapedType(OpAsmParser &parser, Type &type,
+                SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dims) {
+  if (failed(parser.parseType(type))) {
+    return failure();
+  }
+  if (auto shapedType = dyn_cast<ShapedType>(type)) {
+    if (!shapedType.hasStaticShape()) {
+      SmallVector<OpAsmParser::UnresolvedOperand> dynamicDims;
+      if (failed(parser.parseLBrace()) ||
+          failed(parser.parseOperandList(dynamicDims,
+                                         shapedType.getNumDynamicDims(),
+                                         OpAsmParser::Delimiter::None)) ||
+          failed(parser.parseRBrace())) {
+        return failure();
+      }
+      dims.append(dynamicDims);
+    }
+  } else if (isa<IREE::Util::SizeAwareTypeInterface>(type)) {
+    OpAsmParser::UnresolvedOperand size;
+    if (failed(parser.parseLBrace()) || failed(parser.parseOperand(size)) ||
+        failed(parser.parseRBrace())) {
+      return failure();
+    }
+    dims.push_back(size);
+  }
+  return success();
+}
+
+static void printSizedType(OpAsmPrinter &p, Operation *op, Type type,
+                           Value size) {
+  p.printType(type);
+  p << "{";
+  p.printOperand(size);
+  p << "}";
+}
+
+static OperandRange printShapedType(OpAsmPrinter &p, Operation *op, Type type,
+                                    OperandRange dims) {
+  p.printType(type);
+  if (auto shapedType = dyn_cast<ShapedType>(type)) {
+    if (!shapedType.hasStaticShape()) {
+      if (dims.empty()) {
+        p << "{<>}";
+        return dims;
+      }
+      p << "{";
+      llvm::interleaveComma(dims.take_front(shapedType.getNumDynamicDims()), p,
+                            [&](Value value) { p.printOperand(value); });
+      p << "}";
+      dims = dims.drop_front(shapedType.getNumDynamicDims());
+    }
+  } else if (isa<IREE::Util::SizeAwareTypeInterface>(type)) {
+    p << "{";
+    p.printOperand(dims.front());
+    p << "}";
+    dims = dims.drop_front(1);
+  }
+  return dims;
+}
+
+static ParseResult parseEncodedShapedTypeList(
+    OpAsmParser &parser, SmallVectorImpl<Type> &types,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &sizes,
+    SmallVectorImpl<Type> &encodings,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &encodingDims) {
+  do {
+    Type type0;
+    SmallVector<OpAsmParser::UnresolvedOperand> dims0;
+    if (failed(parseShapedType(parser, type0, dims0))) {
+      return failure();
+    }
+    if (succeeded(parser.parseOptionalKeyword("in"))) {
+      Type type1;
+      SmallVector<OpAsmParser::UnresolvedOperand> dims1;
+      if (failed(parseShapedType(parser, type1, dims1))) {
+        return failure();
+      }
+      types.push_back(type1);
+      sizes.append(dims1);
+      encodings.push_back(type0);
+      encodingDims.append(dims0);
+    } else {
+      types.push_back(type0);
+      sizes.append(dims0);
+      encodings.push_back(IREE::Util::UnusedType::get(parser.getContext()));
+    }
+  } while (succeeded(parser.parseOptionalComma()));
+  return success();
+}
+
+static void printEncodedShapedTypeList(OpAsmPrinter &p, Operation *op,
+                                       TypeRange types, OperandRange sizes,
+                                       ArrayAttr encodings,
+                                       OperandRange encodingDims) {
+  llvm::interleaveComma(
+      llvm::zip_equal(types, encodings.getAsValueRange<TypeAttr>()), p,
+      [&](std::tuple<Type, Type> it) {
+        auto [type, encoding] = it;
+        if (!isa<IREE::Util::UnusedType>(encoding)) {
+          encodingDims = printShapedType(p, op, encoding, encodingDims);
+          p << " in ";
+        }
+        sizes = printShapedType(p, op, type, sizes);
+      });
+}
+
+//===----------------------------------------------------------------------===//
+// custom<EncodedShapedResultList>
+//===----------------------------------------------------------------------===//
+// encoding{%dim0, %dim1} in type{%dim2}, type{%size}, %operand4
+//
+// Supported result formats:
+//   type{%size}
+//   %operand as type{%size}
+//   encoding{%dim0, %dim1} in %operand4
+//   encoding{%dim0, %dim1} in %operand4 as type{%size}
+
+static ParseResult parseEncodedShapedResultList(
+    OpAsmParser &parser, ArrayRef<OpAsmParser::UnresolvedOperand> operands,
+    TypeRange operandTypes,
+    ArrayRef<OpAsmParser::UnresolvedOperand> operandSizes,
+    SmallVectorImpl<Type> &resultTypes,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &resultSizes,
+    SmallVectorImpl<Type> &resultEncodingTypes,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &resultEncodingDims,
+    ArrayAttr &tiedOperands) {
+  SmallVector<int64_t> tiedOperandIndices;
+  do {
+    Type type0;
+    SmallVector<OpAsmParser::UnresolvedOperand> dims0;
+    auto typeResult = parser.parseOptionalType(type0);
+    if (typeResult.has_value() && succeeded(typeResult.value())) {
+      if (auto shapedType = dyn_cast<ShapedType>(type0)) {
+        if (!shapedType.hasStaticShape()) {
+          if (failed(parser.parseLBrace()) ||
+              failed(parser.parseOperandList(dims0,
+                                             shapedType.getNumDynamicDims(),
+                                             OpAsmParser::Delimiter::None)) ||
+              failed(parser.parseRBrace())) {
+            return failure();
+          }
+        }
+      } else if (auto sizedType =
+                     dyn_cast<IREE::Util::SizeAwareTypeInterface>(type0)) {
+        OpAsmParser::UnresolvedOperand size;
+        if (failed(parser.parseLBrace()) || failed(parser.parseOperand(size)) ||
+            failed(parser.parseRBrace())) {
+          return failure();
+        }
+        dims0.push_back(size);
+      }
+    }
+
+    // Type only:
+    if (failed(parser.parseOptionalKeyword("in"))) {
+      resultTypes.push_back(type0);
+      resultSizes.append(dims0);
+      resultEncodingTypes.push_back(
+          IREE::Util::UnusedType::get(parser.getContext()));
+      tiedOperandIndices.push_back(IREE::Util::TiedOpInterface::kUntiedIndex);
+      continue;
+    }
+
+    // Check for optional tied result reference.
+    OpAsmParser::UnresolvedOperand tiedResult;
+    auto res = parser.parseOptionalOperand(tiedResult);
+    Type resultType;
+    int64_t tiedOperandIndex = IREE::Util::TiedOpInterface::kUntiedIndex;
+    if (res.has_value() && succeeded(res.value())) {
+      tiedOperandIndex = findTiedOperand(tiedResult, operands);
+      if (tiedOperandIndex == IREE::Util::TiedOpInterface::kUntiedIndex) {
+        return parser.emitError(tiedResult.location,
+                                "tied operand not found for result reference ")
+               << tiedResult.name;
+      }
+      if (succeeded(parser.parseOptionalKeyword("as"))) {
+        // Type _may_ differ from the operand.
+        if (failed(parser.parseType(resultType))) {
+          return failure();
+        }
+      } else {
+        // Use the operands type.
+        resultType = operandTypes[tiedOperandIndex];
+      }
+    } else if (failed(parser.parseType(resultType))) {
+      return failure();
+    }
+
+    // Parse optional type dimensions (usually resource size here).
+    if (auto sizedType =
+            dyn_cast<IREE::Util::SizeAwareTypeInterface>(resultType)) {
+      OpAsmParser::UnresolvedOperand size;
+      if (failed(parser.parseLBrace()) || failed(parser.parseOperand(size)) ||
+          failed(parser.parseRBrace())) {
+        return failure();
+      }
+      resultSizes.push_back(size);
+    }
+
+    resultTypes.push_back(resultType);
+    resultEncodingTypes.push_back(type0);
+    resultEncodingDims.append(dims0);
+    tiedOperandIndices.push_back(tiedOperandIndex);
+  } while (succeeded(parser.parseOptionalComma()));
+  if (!tiedOperandIndices.empty()) {
+    tiedOperands = parser.getBuilder().getIndexArrayAttr(tiedOperandIndices);
+  }
+  return success();
+}
+
+static void printEncodedShapedResultList(
+    OpAsmPrinter &p, Operation *op, ValueRange operands, TypeRange operandTypes,
+    OperandRange operandSizes, TypeRange resultTypes, OperandRange resultSizes,
+    ArrayAttr resultEncodings, OperandRange resultEncodingDims,
+    ArrayAttr tiedOperands) {
+  auto tiedOp = dyn_cast<IREE::Util::TiedOpInterface>(op);
+  for (unsigned i = 0; i < resultTypes.size(); ++i) {
+    auto resultEncodingType =
+        cast<TypeAttr>(resultEncodings.getValue()[i]).getValue();
+    if (!isa<IREE::Util::UnusedType>(resultEncodingType)) {
+      p.printType(resultEncodingType);
+      if (auto shapedType = dyn_cast<ShapedType>(resultEncodingType)) {
+        if (!shapedType.hasStaticShape()) {
+          if (resultEncodingDims.empty()) {
+            p << "{<>}";
+            return;
+          }
+          p << "{";
+          llvm::interleaveComma(
+              resultEncodingDims.take_front(shapedType.getNumDynamicDims()), p,
+              [&](Value value) { p.printOperand(value); });
+          p << "}";
+          resultEncodingDims =
+              resultEncodingDims.drop_front(shapedType.getNumDynamicDims());
+        }
+      } else if (auto sizedType = dyn_cast<IREE::Util::SizeAwareTypeInterface>(
+                     resultEncodingType)) {
+        p << "{";
+        p.printOperand(resultEncodingDims.front());
+        p << "}";
+        resultEncodingDims = resultEncodingDims.drop_front(1);
+      }
+      p << " in ";
+    }
+    auto resultType = resultTypes[i];
+    auto tiedOperandIndex =
+        tiedOp ? tiedOp.getTiedResultOperandIndex(i) : std::nullopt;
+    bool printType = true;
+    if (tiedOperandIndex.has_value()) {
+      auto tiedOperand = op->getOperand(tiedOperandIndex.value());
+      p.printOperand(tiedOperand);
+      if (tiedOperand.getType() != resultType) {
+        p << " as ";
+      } else {
+        // Type elided as it matches the operand.
+        printType = false;
+      }
+    }
+    if (printType) {
+      p.printType(resultType);
+    }
+    if (auto sizedType =
+            dyn_cast<IREE::Util::SizeAwareTypeInterface>(resultType)) {
+      p << "{";
+      p.printOperand(resultSizes.front());
+      p << "}";
+      resultSizes = resultSizes.drop_front(1);
+    }
+    if (i < resultTypes.size() - 1) {
+      p << ", ";
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// custom<EncodedShapedFunctionType>
+//===----------------------------------------------------------------------===//
+// (type, encoding{%dim0, %dim1} in type{%size}, type) ->
+//     (encoding{%dim} in type{%size}, %operand4)
+
+static ParseResult parseEncodedShapedFunctionType(
+    OpAsmParser &parser, ArrayRef<OpAsmParser::UnresolvedOperand> operands,
+    SmallVectorImpl<Type> &operandTypes,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operandSizes,
+    ArrayAttr &operandEncodings,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operandEncodingDims,
+    SmallVectorImpl<Type> &resultTypes,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &resultSizes,
+    ArrayAttr &resultEncodings,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &resultEncodingDims,
+    ArrayAttr &tiedOperands) {
+  SmallVector<Type> operandEncodingTypes;
+  SmallVector<Type> resultEncodingTypes;
+  if (failed(parser.parseLParen())) {
+    return failure();
+  }
+  if (failed(parser.parseOptionalRParen())) {
+    if (failed(parseEncodedShapedTypeList(parser, operandTypes, operandSizes,
                                          operandEncodingTypes,
+                                          operandEncodingDims)) ||
+        failed(parser.parseRParen())) {
+      return failure();
+    }
+  }
+  if (failed(parser.parseArrow())) {
+    return failure();
+  }
+  if (succeeded(parser.parseOptionalLParen())) {
+    if (succeeded(parser.parseOptionalRParen())) {
+      // Empty list/no results `()`.
+    } else {
+      // One or more result types.
+      if (failed(parseEncodedShapedResultList(
+              parser, operands, operandTypes, operandSizes, resultTypes,
+              resultSizes, resultEncodingTypes, resultEncodingDims,
+              tiedOperands)) ||
+          failed(parser.parseRParen())) {
+        return failure();
+      }
+    }
+  } else {
+    // Single result with omitted `()`.
+    if (failed(parseEncodedShapedResultList(
+            parser, operands, operandTypes, operandSizes, resultTypes,
+            resultSizes, resultEncodingTypes, resultEncodingDims,
+            tiedOperands))) {
+      return failure();
+    }
+  }
+  operandEncodings = ArrayAttr::get(
+      parser.getContext(),
+      llvm::map_to_vector(operandEncodingTypes, [](Type type) -> Attribute {
+        return type ? TypeAttr::get(type) : Attribute{};
+      }));
+  resultEncodings = ArrayAttr::get(
+      parser.getContext(),
+      llvm::map_to_vector(resultEncodingTypes, [](Type type) -> Attribute {
+        return TypeAttr::get(type);
+      }));
+  return success();
+}
+
+static void printEncodedShapedFunctionType(
+    OpAsmPrinter &p, Operation *op, ValueRange operands, TypeRange operandTypes,
+    OperandRange operandSizes, ArrayAttr operandEncodings,
+    OperandRange operandEncodingDims, TypeRange resultTypes,
+    OperandRange resultSizes, ArrayAttr resultEncodings,
+    OperandRange resultEncodingDims, ArrayAttr tiedOperands) {
+  p << "(";
+  printEncodedShapedTypeList(p, op, operandTypes, operandSizes,
+                             operandEncodings, operandEncodingDims);
+  p << ") -> ";
+  if (resultTypes.size() != 1) {
+    p << "(";
+  }
+  printEncodedShapedResultList(p, op, operands, operandTypes, operandSizes,
+                               resultTypes, resultSizes, resultEncodings,
+                               resultEncodingDims, tiedOperands);
+  if (resultTypes.size() != 1) {
+    p << ")";
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // custom<ParameterLoadOperations>(
 //     $source_scope, $source_keys, $source_offsets,
@@ -1704,6 +2095,70 @@ ValueRange TensorTraceOp::getResultDynamicDims(unsigned idx) {
   return ValueRange{};
 }

+//===----------------------------------------------------------------------===//
+// stream.tensor.dispatch
+//===----------------------------------------------------------------------===//
+
+LogicalResult TensorDispatchOp::verify() {
+  TensorDispatchOp op = *this;
+  if (failed(verifyOpValueSizes(op, op.getMixedOperands(),
+                                op.getOperandSizes())) ||
+      failed(verifyOpValueSizes(op, op.getResults(), op.getResultSizes()))) {
+    return failure();
+  }
+  if (failed(verifyOpDynamicDimsRange(op, op.getOperandEncodings(),
+                                      op.getOperandEncodingDims())) ||
+      failed(verifyOpDynamicDimsRange(op, op.getResultEncodings(),
+                                      op.getResultEncodingDims()))) {
+    return failure();
+  }
+  return success();
+}
+
+static LogicalResult
+verifyDispatchSymbolUses(Operation *op, ArrayAttr entryPointsAttr,
+                         ValueRange workload,
+                         SymbolTableCollection &symbolTable) {
+  auto entryPointAttrs = entryPointsAttr.getAsRange<SymbolRefAttr>();
+  if (entryPointAttrs.empty()) {
+    return op->emitOpError() << "at least one entry point must be defined";
+  }
+  for (auto entryPointAttr : entryPointAttrs) {
+    auto exportOp =
+        symbolTable.lookupNearestSymbolFrom<IREE::Stream::ExecutableExportOp>(
+            op, entryPointAttr);
+    if (!exportOp) {
+      // TODO(benvanik): there are a lot of tests that are assuming this is not
+      // verified. We'll need to go add dummy executables for all of them. Today
+      // we just bail on the verifier if the symbol isn't found.
+      //
+      // Should be:
+      //   return op->emitOpError() << "undefined entry point: " <<
+      //   entry_point();
+      return success();
+    }
+
+    // Verify that the workload parameters captured match the target export.
+    if (failed(verifyDispatchWorkload(op, exportOp, workload))) {
+      return failure();
+    }
+
+    // TODO(benvanik): verify that the target function has matching operands.
+  }
+  return success();
+}
+
+LogicalResult
+TensorDispatchOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
+  return verifyDispatchSymbolUses(getOperation(), getEntryPointsAttr(),
+                                  getWorkload(), symbolTable);
+}
+
+std::pair<unsigned, unsigned>
+TensorDispatchOp::getTiedOperandsIndexAndLength() {
+  return getODSOperandIndexAndLength(1); // $mixed_operands
+}
+
 //===----------------------------------------------------------------------===//
 // stream.async.alloca
 //===----------------------------------------------------------------------===//
@@ -2220,34 +2675,8 @@ LogicalResult AsyncDispatchOp::verify() {

 LogicalResult
 AsyncDispatchOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
-  Operation *op = getOperation();
-  auto entryPointRefs = getEntryPointRefs();
-  if (entryPointRefs.empty()) {
-    return emitOpError() << "at least one entry point must be defined";
-  }
-  for (auto entryPointAttr : entryPointRefs) {
-    auto exportOp =
-        symbolTable.lookupNearestSymbolFrom<IREE::Stream::ExecutableExportOp>(
-            op, entryPointAttr);
-    if (!exportOp) {
-      // TODO(benvanik): there are a lot of tests that are assuming this is not
-      // verified. We'll need to go add dummy executables for all of them. Today
-      // we just bail on the verifier if the symbol isn't found.
-      //
-      // Should be:
-      //   return op->emitOpError() << "undefined entry point: " <<
-      //   entry_point();
-      return success();
-    }
-
-    // Verify that the workload parameters captured match the target export.
-    if (failed(verifyDispatchWorkload(op, exportOp, getWorkload()))) {
-      return failure();
-    }
-
-    // TODO(benvanik): verify that the target function has matching operands.
-  }
-  return success();
+  return verifyDispatchSymbolUses(getOperation(), getEntryPointsAttr(),
+                                  getWorkload(), symbolTable);
 }

 std::pair<unsigned, unsigned> AsyncDispatchOp::getTiedOperandsIndexAndLength() {
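To make the result grammar above concrete, a hedged sketch of the accepted result forms (value and type names illustrative, not from the patch):

    -> !stream.resource<*>{%size}                                       // type only, unused encoding
    -> tensor<4xf32> in !stream.resource<*>{%size}                      // encoding in a new resource
    -> tensor<?xf32>{%dim} in %operand                                  // encoding, tied to an operand
    -> tensor<?xf32>{%dim} in %operand as !stream.resource<*>{%size}    // encoding, tied, retyped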
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
index bfd755a28279..8ed4bca948fa 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
@@ -1739,6 +1739,80 @@ def Stream_TensorTraceOp : Stream_Op<"tensor.trace", [
   let hasVerifier = 1;
 }

+def Stream_TensorDispatchOp : Stream_Op<"tensor.dispatch", [
+  AttrSizedOperandSegments,
+  DeclareOpInterfaceMethods<SymbolUserOpInterface>,
+  Stream_AffinityOp,
+  Stream_TensorPhaseOp,
+  Stream_StreamableOp,
+  Util_SizeAwareOp,
+  DeclareOpInterfaceMethods<Util_TiedOpInterface, [
+    "getTiedOperandsIndexAndLength",
+  ]>,
+]> {
+  let summary = [{dispatches a parallelized grid of work}];
+  let description = [{
+    Calls the specified entry point function once for each element in the
+    specified workgroup count. Each workgroup has access to the same operands
+    and results and is able to load/store at will.
+  }];
+
+  let arguments = (ins
+    Variadic<Stream_Dim>:$workload,
+    SymbolRefArrayAttr:$entry_points,
+    Variadic<AnyTypeOf<[Stream_AnyStreamResource, AnyType]>>:$mixed_operands,
+    Variadic<Stream_Size>:$operand_sizes,
+    TypeArrayAttr:$operand_encodings,
+    Stream_ShapeDynamicDims:$operand_encoding_dims,
+    Variadic<Stream_Size>:$result_sizes,
+    TypeArrayAttr:$result_encodings,
+    Stream_ShapeDynamicDims:$result_encoding_dims,
+    OptionalAttr<Util_TiedOpStorageAttr>:$tied_operands,
+    OptionalAttr<Stream_AffinityAttr>:$affinity
+  );
+  let results = (outs
+    Variadic<Stream_AnyStreamResource>:$results
+  );
+
+  let assemblyFormat = [{
+    (`on` `(` $affinity^ `)`)?
+    custom<DispatchEntryPoints>($entry_points)
+    (`[` $workload^ `]`)? ``
+    `(` $mixed_operands `)`
+    attr-dict `:`
+    custom<EncodedShapedFunctionType>(
+        ref($mixed_operands),
+        type($mixed_operands), $operand_sizes,
+        $operand_encodings, $operand_encoding_dims,
+        type($results), $result_sizes,
+        $result_encodings, $result_encoding_dims,
+        $tied_operands)
+  }];
+
+  let extraClassDeclaration = [{
+    auto getEntryPointRefs() {
+      return getEntryPoints().getAsRange<SymbolRefAttr>();
+    }
+    void forEachEntryPointAttr(std::function<void(SymbolRefAttr)> fn) {
+      for (auto entryPointAttr : getEntryPointRefs()) fn(entryPointAttr);
+    }
+
+    Value getOperandSize(unsigned idx) {
+      return IREE::Util::findValueSizeInList(idx, getOperands(), getOperandSizes());
+    }
+    Value getResultSize(unsigned idx) {
+      return IREE::Util::findValueSizeInList(idx, getResults(), getResultSizes());
+    }
+  }];
+
+  let hasVerifier = 1;
+
+  let hasCanonicalizer = 1;
+}
+
 } // OpGroupTensorOps

 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_ops.mlir
index a224e4ade0a3..c6d20e13acca 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_ops.mlir
@@ -151,3 +151,18 @@ util.func private @tensorTrace(%tensor0: !stream.resource<staging>, %tensor0_siz
   ]
   util.return
 }
+
+// -----
+
+// CHECK-LABEL: @tensorDispatch
+util.func private @tensorDispatch(%arg0: !stream.resource<*>, %arg1: index, %arg2: index) -> !stream.resource<*> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %c4 = arith.constant 4 : index
+  // CHECK: = stream.tensor.dispatch @executable::@dispatch[%c1, %c2, %c3](%arg0, %c4) :
+  // CHECK-SAME: (tensor<4x?xf32>{%arg2} in !stream.resource<*>{%arg1}, index) -> tensor<?x4xf32>{%arg2} in %arg0{%arg1}
+  %0 = stream.tensor.dispatch @executable::@dispatch[%c1, %c2, %c3](%arg0, %c4) : (tensor<4x?xf32>{%arg2} in !stream.resource<*>{%arg1}, index) -> tensor<?x4xf32>{%arg2} in %arg0{%arg1}
+  util.return %0 : !stream.resource<*>
+}
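The encoding/dim pairing exercised above is what verifyOpDynamicDimsRange enforces; a sketch of IR it would reject (names illustrative): tensor<?x?xf32> declares two dynamic dims but only one value is attached.

    // error: 'stream.tensor.dispatch' op type set has 2 dynamic dimensions but only 1 dimension values are attached
    %0 = stream.tensor.dispatch @ex::@entry(%arg0) : (tensor<?x?xf32>{%dim0} in !stream.resource<*>{%size}) -> !stream.resource<*>{%size}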
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
index d831a44b459c..c3753aab0dfe 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
@@ -589,6 +589,41 @@ struct EncodeTensorStoreOp
   }
 };

+//===----------------------------------------------------------------------===//
+// stream.tensor.dispatch
+//===----------------------------------------------------------------------===//
+
+struct EncodeTensorDispatchOp
+    : public OpRewritePattern<IREE::Stream::TensorDispatchOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(IREE::Stream::TensorDispatchOp op,
+                                PatternRewriter &rewriter) const override {
+    // Strip off the tensor encoding information - it's not used at all here.
+    // If we changed the tensor dispatch op to accept indices and lengths for
+    // offsetting we would need to account for that here but today we require
+    // that to happen on slices/updates instead.
+    Value zeroOffset = rewriter.create<arith::ConstantIndexOp>(op.getLoc(), 0);
+    SmallVector<Value> operandOffsets;
+    SmallVector<Value> operandEnds;
+    SmallVector<Value> operandLengths;
+    auto operandSizes = op.getOperandSizes();
+    for (auto operand : op.getMixedOperands()) {
+      if (isa<IREE::Stream::ResourceType>(operand.getType())) {
+        operandOffsets.push_back(zeroOffset);
+        operandEnds.push_back(operandSizes.front());
+        operandLengths.push_back(operandSizes.front());
+        operandSizes = operandSizes.drop_front(1);
+      }
+    }
+    rewriter.replaceOpWithNewOp<IREE::Stream::AsyncDispatchOp>(
+        op, op.getResultTypes(), op.getWorkload(), op.getEntryPointsAttr(),
+        op.getMixedOperands(), op.getOperandSizes(), operandOffsets,
+        operandEnds, operandLengths, op.getResultSizes(),
+        op.getTiedOperandsAttr(), op.getAffinityAttr());
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // --iree-stream-encode-host-tensors
 //===----------------------------------------------------------------------===//
@@ -602,8 +637,8 @@ struct EncodeHostTensorsPass
         EncodeTensorImportOp, EncodeTensorExportOp, EncodeTensorSizeOfOp,
         EncodeTensorEmptyOp, EncodeTensorConstantOp, EncodeTensorSplatOp,
         EncodeTensorCloneOp, EncodeTensorSliceOp, EncodeTensorFillOp,
-        EncodeTensorUpdateOp, EncodeTensorLoadOp, EncodeTensorStoreOp>(
-        &getContext());
+        EncodeTensorUpdateOp, EncodeTensorLoadOp, EncodeTensorStoreOp,
+        EncodeTensorDispatchOp>(&getContext());
     FrozenRewritePatternSet frozenPatterns(std::move(patterns));
     if (failed(applyPatternsGreedily(getOperation(), frozenPatterns))) {
       return signalPassFailure();
     }
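A sketch of the lowering EncodeTensorDispatchOp performs: once encodings have been resolved, the op degrades to stream.async.dispatch over whole-resource ranges, reusing each resource size as both end and length (async.dispatch syntax taken from the CHECK lines this patch retires; types illustrative):

    %0 = stream.tensor.dispatch @ex::@entry(%arg0) : (tensor<4xf32> in !stream.resource<*>{%size}) -> tensor<4xf32> in !stream.resource<*>{%size}
    // becomes:
    %c0 = arith.constant 0 : index
    %0 = stream.async.dispatch @ex::@entry(%arg0[%c0 to %size for %size]) : (!stream.resource<*>{%size}) -> !stream.resource<*>{%size}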
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
index c379ed2dac47..138ba0be6689 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
@@ -28,8 +28,8 @@ iree_lit_test_suite(
            "encode_device_tensors.mlir",
            "encode_device_tensors_packing.mlir",
            "encode_host_tensors.mlir",
+            "encode_host_tensors_encoding.mlir",
            "encode_host_tensors_packing.mlir",
-            "encode_host_tensors_packing_i1_attr.mlir",
            "encode_host_tensors_packing_i1_experimental_clopt.mlir",
            "fold_globals.mlir",
            "fold_uniform_operands.mlir",
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
index 48e6ccf5b3f8..4c4cb93d80ef 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
@@ -26,8 +26,8 @@ iree_lit_test_suite(
    "encode_device_tensors.mlir"
    "encode_device_tensors_packing.mlir"
    "encode_host_tensors.mlir"
+    "encode_host_tensors_encoding.mlir"
    "encode_host_tensors_packing.mlir"
-    "encode_host_tensors_packing_i1_attr.mlir"
    "encode_host_tensors_packing_i1_experimental_clopt.mlir"
    "fold_globals.mlir"
    "fold_uniform_operands.mlir"
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/convert_to_stream.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/convert_to_stream.mlir
index 8815f6103f78..9e6600d2dcf4 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/convert_to_stream.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/convert_to_stream.mlir
@@ -38,7 +38,7 @@ util.func public @simple_mul(%arg0: !hal.buffer_view) -> !hal.buffer_view attrib
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   // CHECK: %[[RET0_SIZE:.+]] = stream.tensor.sizeof tensor<?xf32>{%[[DIM0]]} : index
-  // CHECK: %[[RET0:.+]] = stream.async.dispatch @executable::@dispatch[%c2, %c1, %c1](%[[ARG0_T]][%c0 to %[[ARG0_SIZE]] for %[[ARG0_SIZE]]]) : (!stream.resource<*>{%[[ARG0_SIZE]]}) -> !stream.resource<*>{%[[RET0_SIZE]]}
+  // CHECK: %[[RET0:.+]] = stream.tensor.dispatch @executable::@dispatch[%c2, %c1, %c1](%[[ARG0_T]]) : (tensor<?xf32>{%[[DIM0]]} in !stream.resource<*>{%[[ARG0_SIZE]]}) -> tensor<?xf32>{%[[DIM0]]} in !stream.resource<*>{%[[RET0_SIZE]]}
   %1 = flow.dispatch @executable::@dispatch[%c2, %c1, %c1](%0) : (tensor<?xf32>{%dim0}) -> tensor<?xf32>{%dim0}

   // CHECK: %[[RET0_T:.+]] = stream.async.transfer %[[RET0]] : !stream.resource<*>{%[[RET0_SIZE]]} -> !stream.resource<external>{%[[RET0_SIZE]]}
@@ -136,7 +136,7 @@ util.func public @while_test() {
   // CHECK: ^bb1(%[[BB1_ARG:.+]]: !stream.resource<*>, %[[BB1_ARG_SIZE:.+]]: index):
   ^bb1(%1: tensor<i32>):
     // CHECK: %[[COND_SIZE:.+]] = stream.tensor.sizeof tensor<i1> : index
-    // CHECK: %[[COND_RESOURCE:.+]] = stream.async.dispatch @while_test_dispatch_0::@dispatch[%c1, %c1, %c1](%[[BB1_ARG]][%c0{{[_0-9]*}} to %[[BB1_ARG_SIZE]] for %[[BB1_ARG_SIZE]]]) : (!stream.resource<*>{%[[BB1_ARG_SIZE]]}) -> !stream.resource<*>{%[[COND_SIZE]]}
+    // CHECK: %[[COND_RESOURCE:.+]] = stream.tensor.dispatch @while_test_dispatch_0::@dispatch[%c1, %c1, %c1](%[[BB1_ARG]]) : (tensor<i32> in !stream.resource<*>{%[[BB1_ARG_SIZE]]}) -> tensor<i1> in !stream.resource<*>{%[[COND_SIZE]]}
    %2 = flow.dispatch @while_test_dispatch_0::@dispatch[%c1, %c1, %c1](%1) : (tensor<i32>) -> tensor<i1>

    // CHECK: %[[READBACK:.+]] = stream.async.transfer %[[COND_RESOURCE]] : !stream.resource<*>{%[[COND_SIZE]]} -> !stream.resource<staging>{%[[COND_SIZE]]}
@@ -149,7 +149,7 @@ util.func public @while_test() {
   // CHECK: ^bb2:
   ^bb2:
     // CHECK: %[[BB2_VAR_SIZE:.+]] = stream.tensor.sizeof tensor<i32> : index
-    // CHECK: %[[BB2_VAR:.+]] = stream.async.dispatch @while_test_dispatch_1::@dispatch[%c1, %c1, %c1](%[[BB1_ARG]][%c0{{[_0-9]*}} to %[[BB1_ARG_SIZE]] for %[[BB1_ARG_SIZE]]]) : (!stream.resource<*>{%[[BB1_ARG_SIZE]]}) -> !stream.resource<*>{%[[BB2_VAR_SIZE]]}
+    // CHECK: %[[BB2_VAR:.+]] = stream.tensor.dispatch @while_test_dispatch_1::@dispatch[%c1, %c1, %c1](%[[BB1_ARG]]) : (tensor<i32> in !stream.resource<*>{%[[BB1_ARG_SIZE]]}) -> tensor<i32> in !stream.resource<*>{%[[BB2_VAR_SIZE]]}
    %4 = flow.dispatch @while_test_dispatch_1::@dispatch[%c1, %c1, %c1](%1) : (tensor<i32>) -> tensor<i32>

    // CHECK: cf.br ^bb1(%[[BB2_VAR]], %[[BB2_VAR_SIZE]] : !stream.resource<*>, index)
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir
index ffe248aee796..9a97b9ec8323 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir
@@ -4,317 +4,29 @@ util.func public @denseTensorSizeOf(%arg0: index) -> index {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 20 : index
   // CHECK: %[[DYNAMIC_SIZE:.+]] = arith.muli %arg0, %[[STATIC_SIZE]] : index
-  %0 = stream.tensor.sizeof tensor<?x5xf32>{%arg0} : index
+  %dynamic_size = stream.tensor.sizeof tensor<?x5xf32>{%arg0} : index
   // CHECK: util.return %[[DYNAMIC_SIZE]]
-  util.return %0 : index
+  util.return %dynamic_size : index
 }

 // -----

 // CHECK-LABEL: @denseTensorSizeOfEmpty
 util.func public @denseTensorSizeOfEmpty(%arg0: index) -> index {
-  // CHECK: %[[ZERO:.+]] = arith.constant 0 : index
-  %0 = stream.tensor.sizeof tensor<?x0xf32>{%arg0} : index
-  // CHECK: util.return %[[ZERO]]
-  util.return %0 : index
+  // CHECK: %[[ZERO_SIZE:.+]] = arith.constant 0 : index
+  %zero_size = stream.tensor.sizeof tensor<?x0xf32>{%arg0} : index
+  // CHECK: util.return %[[ZERO_SIZE]]
+  util.return %zero_size : index
 }

 // -----

-#encoding_layout = #iree_cpu.vmvx_encoding_layout<configuration = {encoding_info = {innerDimsPos = [0, 1], innerTileSizes = [4, 16], outerDimsPerm = [0, 1]}}>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#encoding_layout]>
-util.func public @sizeof_lhs_encoding_dynamic_using_layouts(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_lhs_encoding_dynamic_using_layouts
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C4]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 4, 8, 16>>
-util.func public @sizeof_lhs_encoding_dynamic(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_lhs_encoding_dynamic
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivui %arg0, %[[C4]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-#encoding_layout = #iree_cpu.vmvx_encoding_layout<configuration = {encoding_info = {innerDimsPos = [0, 1], innerTileSizes = [4, 16], outerDimsPerm = [0, 1]}}>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#encoding_layout]>
-util.func public @sizeof_lhs_encoding_partially_dynamic_using_layouts(%arg0: index) -> index {
-  %0 = stream.tensor.sizeof tensor<10x?xf32, #encoding>{%arg0} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_lhs_encoding_partially_dynamic_using_layouts
-// CHECK-DAG: %[[C48:.+]] = arith.constant 48 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg0, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D1]], %[[C48]]
-// CHECK: return %[[T0]]
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 4, 8, 16>>
-util.func public @sizeof_lhs_encoding_partially_dynamic(%arg0: index) -> index {
-  %0 = stream.tensor.sizeof tensor<10x?xf32, #encoding>{%arg0} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_lhs_encoding_partially_dynamic
-// CHECK-DAG: %[[C48:.+]] = arith.constant 48 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg0, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D1]], %[[C48]]
-// CHECK: return %[[T0]]
-
-// -----
-
-// In GEMM, the RHS has the `(M, N, K) -> (K, N)` layout. The tile sizes
-// (i.e., [8, 16]) are for [dim_1, dim_0] in the encoding_info, where dim_1 is
-// N-dimension and dim_0 is K-dimension.
-#encoding_layout = #iree_cpu.vmvx_encoding_layout<configuration = {encoding_info = {innerDimsPos = [1, 0], innerTileSizes = [8, 16], outerDimsPerm = [1, 0]}}>
-#encoding = #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#encoding_layout]>
-util.func public @sizeof_rhs_encoding_dynamic_using_layouts(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_rhs_encoding_dynamic_using_layouts
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C8]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]]
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C16]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C16]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#encoding = #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 4, 8, 16>>
-util.func public @sizeof_rhs_encoding_dynamic(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_rhs_encoding_dynamic
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C8]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]]
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivui %arg0, %[[C16]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C16]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-#encoding_layout = #iree_cpu.vmvx_encoding_layout<configuration = {encoding_info = {innerDimsPos = [0, 1], innerTileSizes = [4, 8], outerDimsPerm = [0, 1]}}>
-#encoding = #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#encoding_layout]>
-util.func public @sizeof_result_encoding_dynamic_using_layouts(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_result_encoding_dynamic_using_layouts
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C4]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C8]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#encoding = #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 4, 8, 16>>
-util.func public @sizeof_result_encoding_dynamic(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_result_encoding_dynamic
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivui %arg0, %[[C4]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C8]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-// The layout is as the same as the the matmul LHS layout because it broadcasts
-// across the batch dimension. The test is preserved for having the same test
-// suite of non-layouts style encoding. I.e., this is the resolved layout
-// version of the below sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic
-// test.
-#encoding_layout = #iree_cpu.vmvx_encoding_layout<configuration = {encoding_info = {innerDimsPos = [0, 1], innerTileSizes = [4, 16], outerDimsPerm = [0, 1]}}>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#encoding_layout]>
-util.func public @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic_using_layouts(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic_using_layouts
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C4]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [[#map, #map3], #map1, #map2], round_dims_to = array<i64: 4, 8, 16>>
-util.func public @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivui %arg0, %[[C4]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-// The M-dimension inner tile is not present because it broadcasts across the
-// M-dimension. We do not need to pack the M-dimension in this case.
-#encoding_layout = #iree_cpu.vmvx_encoding_layout<configuration = {encoding_info = {innerDimsPos = [1], innerTileSizes = [16], outerDimsPerm = [0, 1]}}>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#encoding_layout]>
-util.func public @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic_using_layouts(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic_using_layouts
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-//
-// Multiplied by 4 because f32 has 4 bytes.
-//
-// CHECK: %[[T0:.+]] = arith.muli %arg0, %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [[#map, #map3], #map1, #map2], round_dims_to = array<i64: 4, 8, 16>>
-util.func public @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-//
-// Multiplied by 4 because f32 has 4 bytes.
-//
-// CHECK: %[[T0:.+]] = arith.muli %arg0, %[[C4]]
-// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-// CHECK: return %[[T1]]
-
-// -----
-
-#encoding_layout_0 = #iree_cpu.cpu_encoding_layout<configuration = {encoding_info = {innerDimsPos = [0, 1], innerTileSizes = [4, 8], outerDimsPerm = [0, 1]}}>
-#encoding_layout_1 = #iree_cpu.vmvx_encoding_layout<configuration = {encoding_info = {innerDimsPos = [0, 1], innerTileSizes = [2, 16], outerDimsPerm = [0, 1]}}>
-#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#encoding_layout_0, #encoding_layout_1]>
-util.func public @sizeof_multi_encoding_layouts(%arg0: index, %arg1: index) -> index {
-  %0 = stream.tensor.sizeof tensor<?x?xf32, #encoding>{%arg0, %arg1} : index
-  util.return %0 : index
-}
-// CHECK-LABEL: @sizeof_multi_encoding_layouts
-// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
-//
-// Check for the first layout.
-//
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C4]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C8]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]]
-// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[SIZE0:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
-//
-// Check for the first layout.
-//
-// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C2]]
-// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C2]]
-// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C16]]
-// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
-// CHECK: %[[T1:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
-// CHECK: %[[SIZE1:.+]] = arith.muli %[[T1]], %[[PAD_D1]]
-//
-// Return the max value.
tensor{%arg1} in !stream.resource<*>{%arg2} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource<*> + // CHECK: %[[RESULT:.+]] = stream.async.splat %[[PATTERN]] : i8 -> !stream.resource<*>{%arg2} + %result = stream.tensor.splat %arg0 : i1 -> tensor{%arg1} in !stream.resource<*>{%arg2} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource<*> } // ----- @@ -368,10 +80,10 @@ util.func public @denseTensorSplatI1(%arg0: i1, %arg1: index, %arg2: index) -> ! // CHECK-LABEL: @denseTensorSplatBF16 util.func public @denseTensorSplatBF16(%arg0: bf16, %arg1: index, %arg2: index) -> !stream.resource<*> { // CHECK: %[[PATTERN:.+]] = arith.bitcast %arg0 : bf16 to i16 - // CHECK: %[[RET:.+]] = stream.async.splat %[[PATTERN]] : i16 -> !stream.resource<*>{%arg2} - %0 = stream.tensor.splat %arg0 : bf16 -> tensor{%arg1} in !stream.resource<*>{%arg2} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource<*> + // CHECK: %[[RESULT:.+]] = stream.async.splat %[[PATTERN]] : i16 -> !stream.resource<*>{%arg2} + %result = stream.tensor.splat %arg0 : bf16 -> tensor{%arg1} in !stream.resource<*>{%arg2} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource<*> } // ----- @@ -379,20 +91,20 @@ util.func public @denseTensorSplatBF16(%arg0: bf16, %arg1: index, %arg2: index) // CHECK-LABEL: @denseTensorSplatF32 util.func public @denseTensorSplatF32(%arg0: f32, %arg1: index, %arg2: index) -> !stream.resource<*> { // CHECK: %[[PATTERN:.+]] = arith.bitcast %arg0 : f32 to i32 - // CHECK: %[[RET:.+]] = stream.async.splat %[[PATTERN]] : i32 -> !stream.resource<*>{%arg2} - %0 = stream.tensor.splat %arg0 : f32 -> tensor{%arg1} in !stream.resource<*>{%arg2} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource<*> + // CHECK: %[[RESULT:.+]] = stream.async.splat %[[PATTERN]] : i32 -> !stream.resource<*>{%arg2} + %result = stream.tensor.splat %arg0 : f32 -> tensor{%arg1} in !stream.resource<*>{%arg2} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource<*> } // ----- // CHECK-LABEL: @denseTensorSplatI64 util.func public @denseTensorSplatI64(%arg0: i64, %arg1: index, %arg2: index) -> !stream.resource<*> { - // CHECK: %[[RET:.+]] = stream.async.splat %arg0 : i64 -> !stream.resource<*>{%arg2} - %0 = stream.tensor.splat %arg0 : i64 -> tensor{%arg1} in !stream.resource<*>{%arg2} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource<*> + // CHECK: %[[RESULT:.+]] = stream.async.splat %arg0 : i64 -> !stream.resource<*>{%arg2} + %result = stream.tensor.splat %arg0 : i64 -> tensor{%arg1} in !stream.resource<*>{%arg2} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource<*> } // ----- @@ -400,25 +112,25 @@ util.func public @denseTensorSplatI64(%arg0: i64, %arg1: index, %arg2: index) -> // CHECK-LABEL: @denseTensorSplatConstantComplexF32 util.func public @denseTensorSplatConstantComplexF32(%arg0: !stream.resource<*>) -> (!stream.resource<*>) { %cst = complex.constant [3.000000e+00 : f32, 1.000000e+01 : f32] : complex - %0 = stream.tensor.sizeof tensor<6xcomplex> : index + %result_size = stream.tensor.sizeof tensor<6xcomplex> : index // CHECK: %[[I64NUMBER:.+]] = complex.constant [3.000000e+00 : f32, 1.000000e+01 : f32] : complex // CHECK: %[[BITCAST:.+]] = complex.bitcast %[[I64NUMBER]] : complex to i64 - // CHECK: %[[SPLAT_RES:.+]] = stream.async.splat %[[BITCAST]] - %1 = stream.tensor.splat %cst : complex -> tensor<6xcomplex> in !stream.resource<*>{%0} - // CHECK: util.return %[[SPLAT_RES]] - 
util.return %1 : !stream.resource<*>
+  // CHECK: %[[RESULT:.+]] = stream.async.splat %[[BITCAST]]
+  %result = stream.tensor.splat %cst : complex -> tensor<6xcomplex> in !stream.resource<*>{%result_size}
+  // CHECK: util.return %[[RESULT]]
+  util.return %result : !stream.resource<*>
 }

 // -----

 // CHECK-LABEL: @denseTensorSplatDynamicComplexF32
 util.func public @denseTensorSplatDynamicComplexF32(%arg0: !stream.resource<*>, %arg1: complex) -> (!stream.resource<*>) {
-  %0 = stream.tensor.sizeof tensor<6xcomplex> : index
+  %result_size = stream.tensor.sizeof tensor<6xcomplex> : index
   // CHECK: %[[BITCAST:.+]] = complex.bitcast %arg1 : complex to i64
-  // CHECK: %[[SPLAT_RES:.+]] = stream.async.splat %[[BITCAST]]
-  %1 = stream.tensor.splat %arg1 : complex -> tensor<6xcomplex> in !stream.resource<*>{%0}
-  // CHECK: util.return %[[SPLAT_RES]]
-  util.return %1 : !stream.resource<*>
+  // CHECK: %[[RESULT:.+]] = stream.async.splat %[[BITCAST]]
+  %result = stream.tensor.splat %arg1 : complex -> tensor<6xcomplex> in !stream.resource<*>{%result_size}
+  // CHECK: util.return %[[RESULT]]
+  util.return %result : !stream.resource<*>
 }

 // -----

@@ -429,10 +141,12 @@ util.func public @denseTensorSplatDynamicComplexF32(%arg0: !stream.resource<*>,
 util.func public @denseTensorClone(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: f32) -> (!stream.resource<*>, !stream.resource<*>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  // CHECK: %[[RET:.+]] = stream.async.clone %arg0 : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg2}
-  %0 = stream.tensor.clone %arg0 : tensor{%arg1} in !stream.resource<*>{%arg2} -> tensor{%arg1} in !stream.resource<*>{%arg2}
-  %1 = stream.tensor.fill %arg3, %0[%c0, %c0 for %c1, %c1] : f32 -> tensor{%arg1} in %0 as !stream.resource<*>{%arg2}
-  util.return %0, %1 : !stream.resource<*>, !stream.resource<*>
+  // CHECK: %[[CLONE:.+]] = stream.async.clone %arg0 : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg2}
+  %clone = stream.tensor.clone %arg0 : tensor{%arg1} in !stream.resource<*>{%arg2} -> tensor{%arg1} in !stream.resource<*>{%arg2}
+  // CHECK: %[[FILL:.+]] = stream.async.fill
+  %fill = stream.tensor.fill %arg3, %clone[%c0, %c0 for %c1, %c1] : f32 -> tensor{%arg1} in %clone as !stream.resource<*>{%arg2}
+  // CHECK: util.return %[[CLONE]], %[[FILL]]
+  util.return %clone, %fill : !stream.resource<*>, !stream.resource<*>
 }

 // -----

@@ -443,10 +157,10 @@ util.func public @denseTensorSlice(%arg0: !stream.resource<*>, %arg1: index, %ar
   %c1 = arith.constant 1 : index
   // CHECK: %[[OFFSET:.+]] = arith.constant 4 : index
   // CHECK: %[[END:.+]] = arith.addi %arg4, %[[OFFSET]] : index
-  // CHECK: %[[RET:.+]] = stream.async.slice %arg0[%[[OFFSET]] to %[[END]]] : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg4}
-  %0 = stream.tensor.slice %arg0[%c0, %c1 for %arg3, %c1] : tensor{%arg1} in !stream.resource<*>{%arg2} -> tensor{%arg3} in !stream.resource<*>{%arg4}
-  // CHECK: util.return %[[RET]]
-  util.return %0 : !stream.resource<*>
+  // CHECK: %[[RESULT:.+]] = stream.async.slice %arg0[%[[OFFSET]] to %[[END]]] : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg4}
+  %result = stream.tensor.slice %arg0[%c0, %c1 for %arg3, %c1] : tensor{%arg1} in !stream.resource<*>{%arg2} -> tensor{%arg3} in !stream.resource<*>{%arg4}
+  // CHECK: util.return %[[RESULT]]
+  util.return %result : !stream.resource<*>
 }

 // -----

@@ -458,10 +172,10 @@ util.func public @denseTensorFillF32(%arg0: f32, %arg1: !stream.resource<*>, %ar
   // CHECK-DAG: %[[OFFSET:.+]] = 
arith.constant 0 : index // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 20 : index // CHECK-DAG: %[[PATTERN:.+]] = arith.bitcast %arg0 : f32 to i32 - // CHECK: %[[RET:.+]] = stream.async.fill %[[PATTERN]], %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i32 -> %arg1 as !stream.resource<*>{%arg3} - %0 = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : f32 -> tensor{%arg2} in %arg1 as !stream.resource<*>{%arg3} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource<*> + // CHECK: %[[RESULT:.+]] = stream.async.fill %[[PATTERN]], %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i32 -> %arg1 as !stream.resource<*>{%arg3} + %result = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : f32 -> tensor{%arg2} in %arg1 as !stream.resource<*>{%arg3} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource<*> } // ----- @@ -472,10 +186,10 @@ util.func public @denseTensorFillI64(%arg0: i64, %arg1: !stream.resource<*>, %ar %c1 = arith.constant 1 : index // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 0 : index // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 40 : index - // CHECK: %[[RET:.+]] = stream.async.fill %arg0, %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i64 -> %arg1 as !stream.resource<*>{%arg3} - %0 = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : i64 -> tensor{%arg2} in %arg1 as !stream.resource<*>{%arg3} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource<*> + // CHECK: %[[RESULT:.+]] = stream.async.fill %arg0, %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i64 -> %arg1 as !stream.resource<*>{%arg3} + %result = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : i64 -> tensor{%arg2} in %arg1 as !stream.resource<*>{%arg3} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource<*> } // ----- @@ -487,10 +201,10 @@ util.func public @denseTensorFillF64(%arg0: f64, %arg1: !stream.resource<*>, %ar // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 0 : index // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 40 : index // CHECK-DAG: %[[PATTERN:.+]] = arith.bitcast %arg0 : f64 to i64 - // CHECK: %[[RET:.+]] = stream.async.fill %[[PATTERN]], %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i64 -> %arg1 as !stream.resource<*>{%arg3} - %0 = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : f64 -> tensor{%arg2} in %arg1 as !stream.resource<*>{%arg3} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource<*> + // CHECK: %[[RESULT:.+]] = stream.async.fill %[[PATTERN]], %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i64 -> %arg1 as !stream.resource<*>{%arg3} + %result = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : f64 -> tensor{%arg2} in %arg1 as !stream.resource<*>{%arg3} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource<*> } // ----- @@ -500,10 +214,10 @@ util.func public @denseTensorUpdate(%arg0: !stream.resource<*>, %arg1: index, %a %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index - // CHECK: %[[RET:.+]] = stream.async.update %arg0, %arg2[%[[OFFSET]] to %arg1] : !stream.resource<*>{%arg1} -> %arg2 as !stream.resource<*>{%arg4} - %0 = stream.tensor.update %arg0, %arg2[%c0, %c0] : tensor<2x2xf32> in !stream.resource<*>{%arg1} -> tensor{%arg3} in %arg2 as !stream.resource<*>{%arg4} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource<*> + // CHECK: %[[RESULT:.+]] = stream.async.update %arg0, %arg2[%[[OFFSET]] to %arg1] : !stream.resource<*>{%arg1} -> %arg2 as 
!stream.resource<*>{%arg4} + %result = stream.tensor.update %arg0, %arg2[%c0, %c0] : tensor<2x2xf32> in !stream.resource<*>{%arg1} -> tensor{%arg3} in %arg2 as !stream.resource<*>{%arg4} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource<*> } // ----- @@ -512,10 +226,10 @@ util.func public @denseTensorUpdate(%arg0: !stream.resource<*>, %arg1: index, %a util.func public @denseTensorLoad(%arg0: !stream.resource, %arg1: index, %arg2: index) -> f32 { %c0 = arith.constant 0 : index // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index - // CHECK: %[[RET:.+]] = stream.async.load %arg0[%[[OFFSET]]] : !stream.resource{%arg2} -> f32 - %0 = stream.tensor.load %arg0[%c0] : tensor{%arg1} in !stream.resource{%arg2} -> f32 - // CHECK: util.return %[[RET]] - util.return %0 : f32 + // CHECK: %[[RESULT:.+]] = stream.async.load %arg0[%[[OFFSET]]] : !stream.resource{%arg2} -> f32 + %result = stream.tensor.load %arg0[%c0] : tensor{%arg1} in !stream.resource{%arg2} -> f32 + // CHECK: util.return %[[RESULT]] + util.return %result : f32 } // ----- @@ -524,10 +238,10 @@ util.func public @denseTensorLoad(%arg0: !stream.resource, %arg1: index util.func public @denseTensorLoadRank0(%arg0: !stream.resource, %arg1: index) -> f32 { %c0 = arith.constant 0 : index // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index - // CHECK: %[[RET:.+]] = stream.async.load %arg0[%[[OFFSET]]] : !stream.resource{%arg1} -> f32 - %0 = stream.tensor.load %arg0 : tensor in !stream.resource{%arg1} -> f32 - // CHECK: util.return %[[RET]] - util.return %0 : f32 + // CHECK: %[[RESULT:.+]] = stream.async.load %arg0[%[[OFFSET]]] : !stream.resource{%arg1} -> f32 + %result = stream.tensor.load %arg0 : tensor in !stream.resource{%arg1} -> f32 + // CHECK: util.return %[[RESULT]] + util.return %result : f32 } // ----- @@ -536,10 +250,10 @@ util.func public @denseTensorLoadRank0(%arg0: !stream.resource, %arg1: util.func public @denseTensorStore(%arg0: !stream.resource, %arg1: index, %arg2: index, %arg3: f32) -> !stream.resource { %c0 = arith.constant 0 : index // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index - // CHECK: %[[RET:.+]] = stream.async.store %arg3, %arg0[%[[OFFSET]]] : f32 -> %arg0 as !stream.resource{%arg2} - %0 = stream.tensor.store %arg3, %arg0[%c0] : f32 -> tensor{%arg1} in %arg0 as !stream.resource{%arg2} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource + // CHECK: %[[RESULT:.+]] = stream.async.store %arg3, %arg0[%[[OFFSET]]] : f32 -> %arg0 as !stream.resource{%arg2} + %result = stream.tensor.store %arg3, %arg0[%c0] : f32 -> tensor{%arg1} in %arg0 as !stream.resource{%arg2} + // CHECK: util.return %[[RESULT]] + util.return %result : !stream.resource } // ----- @@ -548,8 +262,27 @@ util.func public @denseTensorStore(%arg0: !stream.resource, %arg1: inde util.func public @denseTensorStoreRank0(%arg0: !stream.resource, %arg1: index, %arg2: f32) -> !stream.resource { %c0 = arith.constant 0 : index // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index - // CHECK: %[[RET:.+]] = stream.async.store %arg2, %arg0[%[[OFFSET]]] : f32 -> %arg0 as !stream.resource{%arg1} - %0 = stream.tensor.store %arg2, %arg0 : f32 -> tensor in %arg0 as !stream.resource{%arg1} - // CHECK: util.return %[[RET]] - util.return %0 : !stream.resource + // CHECK: %[[RESULT:.+]] = stream.async.store %arg2, %arg0[%[[OFFSET]]] : f32 -> %arg0 as !stream.resource{%arg1} + %result = stream.tensor.store %arg2, %arg0 : f32 -> tensor in %arg0 as !stream.resource{%arg1} + // CHECK: util.return %[[RESULT]] + util.return %result : 
!stream.resource +} + +// ----- + +// CHECK-LABEL: @denseTensorDispatch +// CHECK-SAME: (%[[RESOURCE0:.+]]: !stream.resource, %[[RESOURCE0_SIZE:[a-z0-9]+]]: index, %[[TENSOR0_DIM:[a-z0-9]+]]: index, +// CHECK-SAME: %[[RESOURCE1:.+]]: !stream.resource, %[[RESOURCE1_SIZE:[a-z0-9]+]]: index, %[[TENSOR1_DIM:[a-z0-9]+]]: index) +util.func public @denseTensorDispatch( + %resource0: !stream.resource, %resource0_size: index, %tensor0_dim: index, + %resource1: !stream.resource, %resource1_size: index, %tensor1_dim: index) -> (!stream.resource, !stream.resource) { + // CHECK: %[[ZERO:.+]] = arith.constant 0 + // CHECK: %[[RESULTS:.+]]:2 = stream.async.dispatch @ex::@entry + // CHECK-SAME: (%[[RESOURCE0]][%[[ZERO]] to %[[RESOURCE0_SIZE]] for %[[RESOURCE0_SIZE]]], + // CHECK-SAME: %[[RESOURCE1]][%[[ZERO]] to %[[RESOURCE1_SIZE]] for %[[RESOURCE1_SIZE]]]) + // CHECK-SAME: (!stream.resource{%[[RESOURCE0_SIZE]]}, !stream.resource{%[[RESOURCE1_SIZE]]}) -> + // CHECK-SAME: (!stream.resource{%[[RESOURCE1_SIZE]]}, %[[RESOURCE1]]{%[[RESOURCE1_SIZE]]}) + %results:2 = stream.tensor.dispatch @ex::@entry(%resource0, %resource1) : (tensor<4x?xf32>{%tensor0_dim} in !stream.resource{%resource0_size}, tensor{%tensor1_dim} in !stream.resource{%resource1_size}) -> (tensor{%tensor1_dim} in !stream.resource{%resource1_size}, tensor{%tensor1_dim} in %resource1{%resource1_size}) + // CHECK: util.return %[[RESULTS]]#0, %[[RESULTS]]#1 + util.return %results#0, %results#1 : !stream.resource, !stream.resource } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_encoding.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_encoding.mlir new file mode 100644 index 000000000000..8d670ebd6d1c --- /dev/null +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_encoding.mlir @@ -0,0 +1,307 @@ +// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors %s | FileCheck %s + +// CHECK-LABEL: @tensorSizeOfUnalignedPackedI1 +util.func public @tensorSizeOfUnalignedPackedI1() -> index { + // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index + %0 = stream.tensor.sizeof tensor<12xi1, #iree_encoding.packed_storage> : index + // CHECK: return %[[C2]] : index + util.return %0 : index +} + +// ----- + +// CHECK-LABEL: @tensorSizeOfAlignedPackedI1 +util.func public @tensorSizeOfAlignedPackedI1() -> index { + // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index + %0 = stream.tensor.sizeof tensor<24xi1, #iree_encoding.packed_storage> : index + // CHECK: util.return %[[C3]] : index + util.return %0 : index +} + +// ----- + +#encoding_layout = #iree_cpu.vmvx_encoding_layout +#encoding = #iree_encoding.encoding +util.func public @sizeof_lhs_encoding_dynamic_using_layouts(%arg0: index, %arg1: index) -> index { + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index + util.return %0 : index +} +// CHECK-LABEL: @sizeof_lhs_encoding_dynamic_using_layouts +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C4]] +// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]] +// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C16]] +// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]] +// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]] +// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]] +// CHECK: return %[[T1]] + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> 
(d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#encoding = #iree_encoding.encoding>
+util.func public @sizeof_lhs_encoding_dynamic(%arg0: index, %arg1: index) -> index {
+  %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index
+  util.return %0 : index
+}
+// CHECK-LABEL: @sizeof_lhs_encoding_dynamic
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivui %arg0, %[[C4]]
+// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C16]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
+// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
+// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
+// CHECK: return %[[T1]]
+
+// -----
+
+#encoding_layout = #iree_cpu.vmvx_encoding_layout
+#encoding = #iree_encoding.encoding
+util.func public @sizeof_lhs_encoding_partially_dynamic_using_layouts(%arg0: index) -> index {
+  %0 = stream.tensor.sizeof tensor<10x?xf32, #encoding>{%arg0} : index
+  util.return %0 : index
+}
+// CHECK-LABEL: @sizeof_lhs_encoding_partially_dynamic_using_layouts
+// CHECK-DAG: %[[C48:.+]] = arith.constant 48 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg0, %[[C16]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
+// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D1]], %[[C48]]
+// CHECK: return %[[T0]]
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#encoding = #iree_encoding.encoding>
+util.func public @sizeof_lhs_encoding_partially_dynamic(%arg0: index) -> index {
+  %0 = stream.tensor.sizeof tensor<10x?xf32, #encoding>{%arg0} : index
+  util.return %0 : index
+}
+// CHECK-LABEL: @sizeof_lhs_encoding_partially_dynamic
+// CHECK-DAG: %[[C48:.+]] = arith.constant 48 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg0, %[[C16]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
+// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D1]], %[[C48]]
+// CHECK: return %[[T0]]
+
+// -----
+
+// In GEMM, the RHS has the `(M, N, K) -> (K, N)` layout. The tile sizes
+// (i.e., [8, 16]) are for [dim_1, dim_0] in the encoding_info, where dim_1 is
+// the N-dimension and dim_0 is the K-dimension.
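+//
+// As a hypothetical worked example (values not checked by the tests below):
+// with %arg0 (K) = 10 and %arg1 (N) = 20, padding gives
+// ceildiv(10, 16) * 16 = 16 and ceildiv(20, 8) * 8 = 24, so the f32 buffer
+// size is 16 * 4 * 24 = 1536 bytes.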
+#encoding_layout = #iree_cpu.vmvx_encoding_layout +#encoding = #iree_encoding.encoding +util.func public @sizeof_rhs_encoding_dynamic_using_layouts(%arg0: index, %arg1: index) -> index { + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index + util.return %0 : index +} +// CHECK-LABEL: @sizeof_rhs_encoding_dynamic_using_layouts +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C8]] +// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]] +// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C16]] +// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C16]] +// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]] +// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]] +// CHECK: return %[[T1]] + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding> +util.func public @sizeof_rhs_encoding_dynamic(%arg0: index, %arg1: index) -> index { + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index + util.return %0 : index +} +// CHECK-LABEL: @sizeof_rhs_encoding_dynamic +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index +// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C8]] +// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]] +// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivui %arg0, %[[C16]] +// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C16]] +// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]] +// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]] +// CHECK: return %[[T1]] + +// ----- + +#encoding_layout = #iree_cpu.vmvx_encoding_layout +#encoding = #iree_encoding.encoding +util.func public @sizeof_result_encoding_dynamic_using_layouts(%arg0: index, %arg1: index) -> index { + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index + util.return %0 : index +} +// CHECK-LABEL: @sizeof_result_encoding_dynamic_using_layouts +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C4]] +// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]] +// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C8]] +// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]] +// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]] +// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]] +// CHECK: return %[[T1]] + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding> +util.func public @sizeof_result_encoding_dynamic(%arg0: index, %arg1: index) -> index { + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index + util.return %0 : index +} +// CHECK-LABEL: @sizeof_result_encoding_dynamic +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index +// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivui %arg0, %[[C4]] +// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]] +// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C8]] +// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]] +// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]] +// CHECK: 
%[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
+// CHECK: return %[[T1]]
+
+// -----
+
+// The layout is the same as the matmul LHS layout because it broadcasts
+// across the batch dimension. The test is preserved to keep parity with the
+// test suite for the non-layouts style of encoding; i.e., this is the
+// resolved-layout version of the
+// sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic test below.
+#encoding_layout = #iree_cpu.vmvx_encoding_layout
+#encoding = #iree_encoding.encoding
+util.func public @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic_using_layouts(%arg0: index, %arg1: index) -> index {
+  %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index
+  util.return %0 : index
+}
+// CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic_using_layouts
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C4]]
+// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C16]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
+// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
+// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
+// CHECK: return %[[T1]]
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#encoding = #iree_encoding.encoding>
+util.func public @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic(%arg0: index, %arg1: index) -> index {
+  %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index
+  util.return %0 : index
+}
+// CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivui %arg0, %[[C4]]
+// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C16]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
+// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
+// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
+// CHECK: return %[[T1]]
+
+// -----
+
+// The M-dimension inner tile is not present because the encoding broadcasts
+// across the M-dimension. We do not need to pack the M-dimension in this
+// case.
+#encoding_layout = #iree_cpu.vmvx_encoding_layout
+#encoding = #iree_encoding.encoding
+util.func public @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic_using_layouts(%arg0: index, %arg1: index) -> index {
+  %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index
+  util.return %0 : index
+}
+// CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic_using_layouts
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C16]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
+//
+// Multiplied by 4 because f32 has 4 bytes.
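+//
+// As a hypothetical worked example (values not checked by this test): with
+// %arg0 = 10 and %arg1 = 20, the size is
+// 10 * 4 * (ceildiv(20, 16) * 16) = 10 * 4 * 32 = 1280 bytes.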
+//
+// CHECK: %[[T0:.+]] = arith.muli %arg0, %[[C4]]
+// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
+// CHECK: return %[[T1]]
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#encoding = #iree_encoding.encoding>
+util.func public @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic(%arg0: index, %arg1: index) -> index {
+  %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index
+  util.return %0 : index
+}
+// CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivui %arg1, %[[C16]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
+//
+// Multiplied by 4 because f32 has 4 bytes.
+//
+// CHECK: %[[T0:.+]] = arith.muli %arg0, %[[C4]]
+// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
+// CHECK: return %[[T1]]
+
+// -----
+
+#encoding_layout_0 = #iree_cpu.cpu_encoding_layout
+#encoding_layout_1 = #iree_cpu.vmvx_encoding_layout
+#encoding = #iree_encoding.encoding
+util.func public @sizeof_multi_encoding_layouts(%arg0: index, %arg1: index) -> index {
+  %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index
+  util.return %0 : index
+}
+// CHECK-LABEL: @sizeof_multi_encoding_layouts
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
+//
+// Check for the first layout.
+//
+// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C4]]
+// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C4]]
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C8]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C8]]
+// CHECK: %[[T0:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
+// CHECK: %[[SIZE0:.+]] = arith.muli %[[T0]], %[[PAD_D1]]
+//
+// Check for the second layout.
+//
+// CHECK: %[[CEIL_DIV_D0:.+]] = arith.ceildivsi %arg0, %[[C2]]
+// CHECK: %[[PAD_D0:.+]] = arith.muli %[[CEIL_DIV_D0]], %[[C2]]
+// CHECK: %[[CEIL_DIV_D1:.+]] = arith.ceildivsi %arg1, %[[C16]]
+// CHECK: %[[PAD_D1:.+]] = arith.muli %[[CEIL_DIV_D1]], %[[C16]]
+// CHECK: %[[T1:.+]] = arith.muli %[[PAD_D0]], %[[C4]]
+// CHECK: %[[SIZE1:.+]] = arith.muli %[[T1]], %[[PAD_D1]]
+//
+// Return the max value.
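+//
+// As a hypothetical worked example (values not checked by this test): with
+// %arg0 = %arg1 = 10, the first layout needs
+// (ceildiv(10, 4) * 4) * 4 * (ceildiv(10, 8) * 8) = 12 * 4 * 16 = 768 bytes
+// and the second needs
+// (ceildiv(10, 2) * 2) * 4 * (ceildiv(10, 16) * 16) = 10 * 4 * 16 = 640
+// bytes, so the buffer is sized for the larger of the two: 768 bytes.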
+// +// CHECK: %[[RES:.+]] = arith.maxui %[[SIZE0]], %[[SIZE1]] +// CHECK: return %[[RES]] diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_attr.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_attr.mlir deleted file mode 100644 index eefc9810aed5..000000000000 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_attr.mlir +++ /dev/null @@ -1,22 +0,0 @@ -// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors %s | FileCheck %s - -#packed = #iree_encoding.packed_storage -func.func @unaligned_i1_size() -> index { - %0 = stream.tensor.sizeof tensor<12xi1, #packed> : index - return %0 : index -} -// CHECK: func @unaligned_i1_size() -> index { -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK: return %[[C2]] : index - -// ----- - -#packed = #iree_encoding.packed_storage -func.func @aligned_i1_size() -> index { - %0 = stream.tensor.sizeof tensor<24xi1, #packed> : index - return %0 : index -} - -// CHECK: func @aligned_i1_size() -> index { -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK: return %[[C3]] : index diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir index 12527ae139b1..c96e05270d12 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir @@ -1,20 +1,23 @@ +// This is only used to test the experimental packing flag. When the default +// is changed the encode_host_tensors.mlir test should be updated and used +// instead and this file should be deleted. + // RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --iree-experimental-packed-i1-storage %s | FileCheck %s -func.func @unaligned_i1_size() -> index { +// CHECK-LABEL: @tensorSizeOfUnalignedPackedI1 +util.func @tensorSizeOfUnalignedPackedI1() -> index { + // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index %0 = stream.tensor.sizeof tensor<12xi1> : index - return %0 : index + // CHECK: return %[[C2]] : index + util.return %0 : index } -// CHECK: func @unaligned_i1_size() -> index { -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK: return %[[C2]] : index // ----- -func.func @aligned_i1_size() -> index { +// CHECK-LABEL: @tensorSizeOfAlignedPackedI1 +util.func @tensorSizeOfAlignedPackedI1() -> index { + // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index %0 = stream.tensor.sizeof tensor<24xi1> : index - return %0 : index + // CHECK: util.return %[[C3]] : index + util.return %0 : index } - -// CHECK: func @aligned_i1_size() -> index { -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK: return %[[C3]] : index diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.cpp b/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.cpp index 410252bab913..c9e470fd443e 100644 --- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.cpp @@ -69,11 +69,8 @@ ArrayAttr deduplicateArrayElements(ArrayAttr arrayAttr) { return ArrayAttr::get(arrayAttr.getContext(), attrsSet.takeVector()); } -// Finds the operand index in |operands| that |tiedResult| references. 
-// Returns TiedOpInterface::kUntiedIndex if no operand is found. -static int64_t -findTiedOperand(OpAsmParser::UnresolvedOperand tiedResult, - ArrayRef operands) { +int64_t findTiedOperand(OpAsmParser::UnresolvedOperand tiedResult, + ArrayRef operands) { int64_t operandIndex = IREE::Util::TiedOpInterface::kUntiedIndex; for (int64_t i = 0; i < operands.size(); ++i) { if (operands[i].name == tiedResult.name && diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.h b/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.h index 1623b8e9a5bb..c0bdbce54a0c 100644 --- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.h +++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.h @@ -48,6 +48,11 @@ Value buildIfElseTree( // Removes duplicate attributes in the array (if any). ArrayAttr deduplicateArrayElements(ArrayAttr arrayAttr); +// Finds the operand index in |operands| that |tiedResult| references. +// Returns TiedOpInterface::kUntiedIndex if no operand is found. +int64_t findTiedOperand(OpAsmParser::UnresolvedOperand tiedResult, + ArrayRef operands); + //===----------------------------------------------------------------------===// // custom($sym_visibility) //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.td b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.td index ea1f222ad5d4..146e22c11c88 100644 --- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.td +++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.td @@ -183,6 +183,26 @@ def Util_ObjectType : TypeDef { }]; } +//===----------------------------------------------------------------------===// +// !util.unused +//===----------------------------------------------------------------------===// + +def Util_UnusedType : TypeDef { + let mnemonic = "unused"; + + let summary = [{a placeholder for unused types}]; + let description = [{ + An unused type placeholder used to satisfy verifiers that may require a + type even if unused. + }]; + + let builders = [ + TypeBuilder<(ins), [{ + return $_get($_ctxt); + }]> + ]; +} + //===----------------------------------------------------------------------===// // !util.variant //===----------------------------------------------------------------------===//