diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index a5925f4aeaae..cd2876eb25e8 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -286,8 +286,6 @@ static void printDispatchWorkgroupsCountRegion(OpAsmPrinter &p, Operation *op,
 // flow.dispatch.region
 //===----------------------------------------------------------------------===//
-// Verifies the workgroup count
-
 static LogicalResult verifyWorkgroupCountRegion(Operation *op,
                                                 ValueRange workload,
                                                 Region &region) {
   // Verify the workload operands match the expected capture args.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
new file mode 100644
index 000000000000..1d5d4e61e6b3
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
@@ -0,0 +1,378 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <utility>
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
+#include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
+#include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "iree/compiler/Utils/StringUtils.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-dispatch"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace Flow {
+namespace {
+
+static int64_t costOfDomain(ArrayRef<int64_t> domain) {
+  int64_t product = 1;
+  for (int64_t size : domain) {
+    if (size == mlir::ShapedType::kDynamic)
+      return INT64_MAX;
+    product *= size;
+  }
+  return product;
+};
+
+// Estimates the evaluation cost of a linalg op using a heuristic cost model.
+static int64_t estimateLinalgOpCost(linalg::LinalgOp op) {
+  // For linalg ops we know the iteration domain, so return the number
+  // of iterations of the iteration domain (or INT64_MAX for dynamic.)
+  int64_t cost = costOfDomain(op.getStaticLoopRanges());
+  LLVM_DEBUG(llvm::dbgs() << "// " << op->getName() << " cost: " << cost
+                          << "\n");
+  return cost;
+}
+
+static TensorType getMainTensorForLinalgExtOp(Operation *op) {
+  TensorType main;
+  auto operandTypes = llvm::to_vector(op->getOperandTypes());
+  auto resultTypes = llvm::to_vector(op->getResultTypes());
+  for (Type t : llvm::concat<Type>(operandTypes, resultTypes)) {
+    auto tensorType = llvm::dyn_cast<TensorType>(t);
+    if (!tensorType)
+      continue;
+    if (!main) {
+      main = tensorType;
+    } else if (costOfDomain(tensorType.getShape()) >
+               costOfDomain(main.getShape())) {
+      main = tensorType;
+    }
+  }
+  return main;
+}
+
+// Estimates the evaluation cost of a LinalgExt op using a heuristic cost
+// model.
+static int64_t estimateLinalgExtOpCost(Operation *op) {
+  TensorType mainTensor = getMainTensorForLinalgExtOp(op);
+  // Use the cost of the biggest tensor of the LinalgExt op as an
+  // approximation. This is a very, very coarse approximation.
+  auto cost = mainTensor ? costOfDomain(mainTensor.getShape()) : 1;
+  // Multiply by a semi-arbitrarily chosen factor to capture that LinalgExt ops
+  // are "somewhat more expensive" than simply traversing the main tensor.
+  // This is something like the extra log(N) factor for a sort or FFT, or
+  // the amount of work done by a softmax vs a cheap elementwise on a tensor
+  // of the same shape.
+  cost *= 10;
+  LLVM_DEBUG(llvm::dbgs() << "// " << op->getName() << " cost: " << cost
+                          << "\n");
+  return cost;
+}
+
+// Estimates the evaluation cost of a Linalg::Softmax op using a heuristic cost
+// model similar to LinalgExt ops.
+static int64_t estimateLinalgSoftmaxOpCost(Operation *op) {
+  return estimateLinalgExtOpCost(op);
+}
+
+// Returns a string like "512xDx128" representing loop ranges.
+static std::string loopRangesToString(ArrayRef<int64_t> loopRanges) {
+  std::string outputString;
+  llvm::raw_string_ostream sstream(outputString);
+  llvm::interleave(
+      loopRanges,
+      [&](int64_t loopRange) {
+        // Note: normally we'd use '?', but that isn't a valid character for
+        // function names on a variety of targets, so we stick to [a-Z0-9_]
+        // characters.
+        sstream << (ShapedType::isDynamic(loopRange) ? "D"
+                                                     : llvm::itostr(loopRange));
+      },
+      [&] { sstream << "x"; });
+  return outputString;
+}
+
+static std::string operandTypeToString(Value operandValue) {
+  auto operandType = operandValue.getType();
+  std::string outputString;
+  llvm::raw_string_ostream sstream(outputString);
+  if (auto shapedType = dyn_cast<ShapedType>(operandType)) {
+    shapedType.getElementType().print(sstream);
+  } else {
+    operandType.print(sstream);
+  }
+  return outputString;
+}
+
+// Returns a string like "f32xi32xf16" representing a linalg op's types for each
+// operand. Will collapse to single type if all match.
+static std::string getLinalgDataTypes(linalg::LinalgOp op) {
+  std::string firstToken = "";
+  bool allTokensSame = true;
+  SmallVector<std::string> datatypeTokens;
+
+  for (Value operandValue : op->getOperands()) {
+    datatypeTokens.push_back(operandTypeToString(operandValue));
+    if (firstToken.empty()) {
+      firstToken = operandTypeToString(operandValue);
+    } else if (allTokensSame) {
+      allTokensSame = firstToken == operandTypeToString(operandValue);
+    }
+  }
+
+  if (allTokensSame) {
+    return firstToken;
+  } else {
+    std::string outputString;
+    llvm::raw_string_ostream sstream(outputString);
+    llvm::interleave(
+        datatypeTokens, [&](std::string token) { sstream << token; },
+        [&] { sstream << "x"; });
+    return outputString;
+  }
+}
+
+/// Returns the op name without dialect name. E.g., it returns "set_encoding" if
+/// the input operation is iree_linalg_ext.set_encoding.
+static std::string getOpNameWithoutDialectName(Operation *op) {
+  auto opName =
+      op->getName().getStringRef().drop_until([](char c) { return c == '.'; });
+  if (opName.starts_with("."))
+    opName = opName.drop_front();
+  return opName.str();
+}
+
+static std::string summarizeLinalgOp(linalg::LinalgOp op) {
+  auto opName = op->getName().getStringRef();
+  if (!opName.consume_front("linalg."))
+    return "";
+  std::string opLoopRanges = loopRangesToString(op.getStaticLoopRanges());
+  std::string opTypes = opLoopRanges.empty() ? "" : getLinalgDataTypes(op);
+  return opName.str() + (opLoopRanges.empty() ? "" : "_" + opLoopRanges) +
"" : "_" + opLoopRanges) + + (opTypes.empty() ? "" : "_" + opTypes); +} + +static std::string summarizeLinalgExtOp(Operation *op) { + auto opName = op->getName().getStringRef(); + // Currently, this utility is also invoked by Linalg::SoftmaxOp. + if (!(opName.consume_front("iree_linalg_ext.") || + opName.consume_front("linalg."))) + return ""; + std::string suffix = ""; + if (TensorType mainTensor = getMainTensorForLinalgExtOp(op)) { + llvm::raw_string_ostream sstream(suffix); + sstream << "_"; + sstream << loopRangesToString(mainTensor.getShape()); + sstream << "x"; + mainTensor.getElementType().print(sstream); + sstream.flush(); + } + return opName.str() + suffix; +} + +// Summarizes the contents of a dispatch into a short string. +// This uses heuristics to aid developer debugging. +static std::string summarizeDispatchRegion(Region ®ion) { + // The goal here is to build a relatively concise description that gives + // enough information to developers to see roughly what sort of computation a + // dispatch region performs. Multiple approaches are valid here, depending on + // what a developer wants to highlight. + // + // Currently, this uses a cost model to estimate which individual operation + // is the most computationally expensive, then a summary is generated which + // includes some of that operation's parameters. + // + // Other metrics to determine which single op is the "best" or which list of + // ops is most interesting (e.g. to highlight large data movements) could be + // used instead. + + Operation *bestOp = NULL; + const int64_t kMinEstimatedCost = -1; + int64_t bestEstimatedCost = kMinEstimatedCost; + region.walk([&](Operation *op) { + TypeSwitch(op) + .Case([&](auto op) { + int64_t estimatedCost = estimateLinalgSoftmaxOpCost(op); + if (estimatedCost < bestEstimatedCost) + return; + bestEstimatedCost = estimatedCost; + bestOp = op; + LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() + << "', cost: " << bestEstimatedCost << "\n"); + }) + .Case([&](auto op) { + int64_t estimatedCost = estimateLinalgOpCost(op); + if (estimatedCost < bestEstimatedCost) + return; + bestEstimatedCost = estimatedCost; + bestOp = op; + LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() + << "', cost: " << bestEstimatedCost << "\n"); + }) + .Case( + [&](auto op) { + // SetEncoding/UnsetEncoding is the bestOp only if there are no + // other operations. + int64_t estimatedCost = kMinEstimatedCost + 1; + if (estimatedCost < bestEstimatedCost) + return; + bestEstimatedCost = estimatedCost; + bestOp = op; + LLVM_DEBUG(llvm::dbgs() + << "// new best op: '" << bestOp->getName() + << "', cost: " << bestEstimatedCost << "\n"); + }) + .Case([&](auto op) { + int64_t estimatedCost = estimateLinalgExtOpCost(op); + if (estimatedCost < bestEstimatedCost) + return; + bestEstimatedCost = estimatedCost; + bestOp = op; + LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() + << "', cost: " << bestEstimatedCost << "\n"); + }) + .Default([&](Operation *op) { + // No cost estimation implemented, skip. + }); + }); + + if (!bestOp) { + std::string bestSummary = ""; + // Check if there is a possible slow memory copy as a dispatch. The current + // heuristic is to check if a dispatch.tensor.store stores a tensor that is + // directly loaded from a dispatch.tensor.load. 
+    region.walk([&](IREE::Flow::DispatchTensorStoreOp storeOp) {
+      Value input = storeOp.getValue();
+      if (auto loadOp =
+              input.getDefiningOp<IREE::Flow::DispatchTensorLoadOp>()) {
+        bestSummary = "slow_memcpy";
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+    return bestSummary;
+  }
+
+  std::string bestSummary = "";
+  TypeSwitch<Operation *>(bestOp)
+      .Case<linalg::SoftmaxOp>(
+          [&](auto op) { bestSummary = summarizeLinalgExtOp(op); })
+      .Case<linalg::LinalgOp>(
+          [&](auto op) { bestSummary = summarizeLinalgOp(op); })
+      .Case<IREE::LinalgExt::SetEncodingOp>([&](auto op) {
+        auto opName = getOpNameWithoutDialectName(op);
+        auto encoding = op.getResultType()
+                            .getEncoding()
+                            .template cast<IREE::LinalgExt::EncodingAttr>();
+        auto user = stringifyEnum(encoding.getUser().getValue());
+        auto role = stringifyEnum(encoding.getRole().getValue());
+        ArrayRef<int64_t> shape = op.getSourceType().getShape();
+        bestSummary = opName + "_" + user.str() + "_" + role.str() + "_" +
+                      loopRangesToString(shape);
+        ;
+      })
+      .Case<IREE::LinalgExt::UnsetEncodingOp>([&](auto op) {
+        auto opName = getOpNameWithoutDialectName(op);
+        auto encoding = op.getSourceType()
+                            .getEncoding()
+                            .template cast<IREE::LinalgExt::EncodingAttr>();
+        auto user = stringifyEnum(encoding.getUser().getValue());
+        auto role = stringifyEnum(encoding.getRole().getValue());
+        ArrayRef<int64_t> shape = op.getResultType().getShape();
+        bestSummary = opName + "_" + user.str() + "_" + role.str() + "_" +
+                      loopRangesToString(shape);
+      })
+      .Case<IREE::LinalgExt::LinalgExtOp>(
+          [&](auto op) { bestSummary = summarizeLinalgExtOp(op); })
+      .Default([&](Operation *op) {
+        // No summarization implemented, default to the op's name.
+        bestSummary = op->getName().getStringRef().str();
+      });
+
+  // Sanitize the string so that it contains only C literal-compatible chars.
+  bestSummary = sanitizeSymbolName(bestSummary);
+
+  LLVM_DEBUG(llvm::dbgs() << "// best op summary: '" << bestSummary << "'\n");
+  return bestSummary;
+}
+
+} // namespace
+
+class AnnotateDispatchesPass
+    : public AnnotateDispatchesBase<AnnotateDispatchesPass> {
+public:
+  AnnotateDispatchesPass() = default;
+
+  void runOnOperation() override {
+    DenseMap<Attribute, SymbolRefAttr> entryPointRefReplacements;
+    for (auto executableOp :
+         getOperation().getBody()->getOps<IREE::Flow::ExecutableOp>()) {
+      // Rename each export op.
+      for (auto exportOp :
+           executableOp.getBlock().getOps<IREE::Flow::ExecutableExportOp>()) {
+        auto oldSymbolRefAttr = SymbolRefAttr::get(
+            &getContext(), executableOp.getName(),
+            {SymbolRefAttr::get(&getContext(), exportOp.getSymName())});
+
+        auto funcOp =
+            executableOp.getInnerModule().lookupSymbol<FunctionOpInterface>(
+                exportOp.getFunctionRef());
+        if (!funcOp)
+          continue; // extern module, maybe
+        std::string summary = summarizeDispatchRegion(funcOp.getFunctionBody());
+        if (summary.empty())
+          continue; // unable to tell
+
+        std::string newName = funcOp.getName().str() + "_" + summary;
+
+        exportOp.setSymName(newName);
+        exportOp.setFunctionRef(newName);
+        funcOp.setName(newName);
+
+        auto newSymbolRefAttr =
+            SymbolRefAttr::get(&getContext(), executableOp.getName(),
+                               {SymbolRefAttr::get(&getContext(), newName)});
+        entryPointRefReplacements[oldSymbolRefAttr] = newSymbolRefAttr;
+      }
+    }
+
+    // Replace each usage of an entry point with its original symbol name with a
+    // new symbol name.
+    for (auto funcLikeOp : getOperation().getOps<FunctionOpInterface>()) {
+      funcLikeOp->walk([&](IREE::Flow::DispatchOp dispatchOp) {
+        auto it = entryPointRefReplacements.find(dispatchOp.getEntryPoint());
+        if (it != entryPointRefReplacements.end()) {
+          dispatchOp.setEntryPointAttr(llvm::cast<SymbolRefAttr>(it->second));
+        }
+      });
+    }
+  }
+};
+
+std::unique_ptr<OperationPass<mlir::ModuleOp>> createAnnotateDispatchesPass() {
+  return std::make_unique<AnnotateDispatchesPass>();
+}
+
+} // namespace Flow
+} // namespace IREE
+} // namespace iree_compiler
+} // namespace mlir
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel
index a3ad95f80049..1fbf965b3c40 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel
@@ -30,6 +30,7 @@ iree_gentbl_cc_library(
 iree_compiler_cc_library(
     name = "Transforms",
     srcs = [
+        "AnnotateDispatches.cpp",
         "CaptureDispatchDynamicDims.cpp",
         "CleanupNumericNarrowing.cpp",
         "CleanupTensorShapes.cpp",
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
index c3b86ccf1eef..3d931217c0fa 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
@@ -29,6 +29,7 @@ iree_cc_library(
     "Passes.h.inc"
     "RegionOpUtils.h"
   SRCS
+    "AnnotateDispatches.cpp"
    "CaptureDispatchDynamicDims.cpp"
    "CleanupNumericNarrowing.cpp"
    "CleanupTensorShapes.cpp"
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
index c472fd9ccf69..f90df6aa4884 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
@@ -6,17 +6,12 @@
 
 #include <utility>
 
-#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
 #include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
-#include "iree/compiler/Utils/StringUtils.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
@@ -24,299 +19,12 @@
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/Pass/Pass.h"
 
-#define DEBUG_TYPE "iree-dispatch"
-
 namespace mlir {
 namespace iree_compiler {
 namespace IREE {
 namespace Flow {
 namespace {
 
-static int64_t costOfDomain(ArrayRef<int64_t> domain) {
-  int64_t product = 1;
-  for (int64_t size : domain) {
-    if (size == mlir::ShapedType::kDynamic)
-      return INT64_MAX;
-    product *= size;
-  }
-  return product;
-};
-
-// Estimates the evaluation cost of a linalg op using a heuristic cost model.
-static int64_t estimateLinalgOpCost(linalg::LinalgOp op) {
-  // For linalg ops we know the iteration domain, so return the number
-  // of iterations of the iteration domain (or INT64_MAX for dynamic.)
- int64_t cost = costOfDomain(op.getStaticLoopRanges()); - LLVM_DEBUG(llvm::dbgs() << "// " << op->getName() << " cost: " << cost - << "\n"); - return cost; -} - -static TensorType getMainTensorForLinalgExtOp(Operation *op) { - TensorType main; - auto operandTypes = llvm::to_vector(op->getOperandTypes()); - auto resultTypes = llvm::to_vector(op->getResultTypes()); - for (Type t : llvm::concat(operandTypes, resultTypes)) { - auto tensorType = llvm::dyn_cast(t); - if (!tensorType) - continue; - if (!main) { - main = tensorType; - } else if (costOfDomain(tensorType.getShape()) > - costOfDomain(main.getShape())) { - main = tensorType; - } - } - return main; -} - -// Estimates the evaluation cost of a LinalgExt op using a heuristic cost -// model. -static int64_t estimateLinalgExtOpCost(Operation *op) { - TensorType mainTensor = getMainTensorForLinalgExtOp(op); - // Use the cost of the biggest tensor of the LinalgExt op as an approximation. - // This is a very, very coarse approximation. - auto cost = mainTensor ? costOfDomain(mainTensor.getShape()) : 1; - // Multiply by a semi-arbitrarily chosen factor to capture that LinalgExt ops - // are "somewhat more expensive" than simply traversing the main tensor. - // This is something like the extra log(N) factor for a sort or FFT, or - // the amount of work done by a softmax vs a cheap elementwise on a tensor - // of the same shape. - cost *= 10; - LLVM_DEBUG(llvm::dbgs() << "// " << op->getName() << " cost: " << cost - << "\n"); - return cost; -} - -// Estimates the evaluation cost of a Linalg::Softmax op using a heuristic cost -// model similar to LinalgExt ops. -static int64_t estimateLinalgSoftmaxOpCost(Operation *op) { - return estimateLinalgExtOpCost(op); -} - -// Returns a string like "512xDx128" representing loop ranges. -static std::string loopRangesToString(ArrayRef loopRanges) { - std::string outputString; - llvm::raw_string_ostream sstream(outputString); - llvm::interleave( - loopRanges, - [&](int64_t loopRange) { - // Note: normally we'd use '?', but that isn't a valid character for - // function names on a variety of targets, so we stick to [a-Z0-9_] - // characters. - sstream << (ShapedType::isDynamic(loopRange) ? "D" - : llvm::itostr(loopRange)); - }, - [&] { sstream << "x"; }); - return outputString; -} - -static std::string operandTypeToString(Value operandValue) { - auto operandType = operandValue.getType(); - std::string outputString; - llvm::raw_string_ostream sstream(outputString); - if (auto shapedType = dyn_cast(operandType)) { - shapedType.getElementType().print(sstream); - } else { - operandType.print(sstream); - } - return outputString; -} - -// Returns a string like "f32xi32xf16" representing a linalg op's types for each -// operands. Will collapse to single type if all match. 
-static std::string getLinalgDataTypes(linalg::LinalgOp op) { - std::string firstToken = ""; - bool allTokensSame = true; - SmallVector datatypeTokens; - - for (Value operandValue : op->getOperands()) { - datatypeTokens.push_back(operandTypeToString(operandValue)); - if (firstToken.empty()) { - firstToken = operandTypeToString(operandValue); - } else if (allTokensSame) { - allTokensSame = firstToken == operandTypeToString(operandValue); - } - } - - if (allTokensSame) { - return firstToken; - } else { - std::string outputString; - llvm::raw_string_ostream sstream(outputString); - llvm::interleave( - datatypeTokens, [&](std::string token) { sstream << token; }, - [&] { sstream << "x"; }); - return outputString; - } -} - -/// Returns the op name without dialect name. E.g., it returns "set_encoding" if -/// the input operation is iree_linalg_ext.set_encoding. -static std::string getOpNameWithoutDialectName(Operation *op) { - auto opName = - op->getName().getStringRef().drop_until([](char c) { return c == '.'; }); - if (opName.starts_with(".")) - opName = opName.drop_front(); - return opName.str(); -} - -static std::string summarizeLinalgOp(linalg::LinalgOp op) { - auto opName = op->getName().getStringRef(); - if (!opName.consume_front("linalg.")) - return ""; - std::string opLoopRanges = loopRangesToString(op.getStaticLoopRanges()); - std::string opTypes = opLoopRanges.empty() ? "" : getLinalgDataTypes(op); - return opName.str() + (opLoopRanges.empty() ? "" : "_" + opLoopRanges) + - (opTypes.empty() ? "" : "_" + opTypes); -} - -static std::string summarizeLinalgExtOp(Operation *op) { - auto opName = op->getName().getStringRef(); - // Currently, this utility is also invoked by Linalg::SoftmaxOp. - if (!(opName.consume_front("iree_linalg_ext.") || - opName.consume_front("linalg."))) - return ""; - std::string suffix = ""; - if (TensorType mainTensor = getMainTensorForLinalgExtOp(op)) { - llvm::raw_string_ostream sstream(suffix); - sstream << "_"; - sstream << loopRangesToString(mainTensor.getShape()); - sstream << "x"; - mainTensor.getElementType().print(sstream); - sstream.flush(); - } - return opName.str() + suffix; -} - -// Summarizes the contents of a dispatch into a short string. -// This uses heuristics to aid developer debugging. -static std::string -summarizeDispatchWorkgroupsOp(DispatchWorkgroupsOp regionOp) { - // The goal here is to build a relatively concise description that gives - // enough information to developers to see roughly what sort of computation a - // dispatch region performs. Multiple approaches are valid here, depending on - // what a developer wants to highlight. - // - // Currently, this uses a cost model to estimate which individual operation - // is the most computationally expensive, then a summary is generated which - // includes some of that operation's parameters. - // - // Other metrics to determine which single op is the "best" or which list of - // ops is most interesting (e.g. to highlight large data movements) could be - // used instead. 
- - Operation *bestOp = NULL; - const int64_t kMinEstimatedCost = -1; - int64_t bestEstimatedCost = kMinEstimatedCost; - regionOp.getWorkgroupBody().walk([&](Operation *op) { - TypeSwitch(op) - .Case([&](auto op) { - int64_t estimatedCost = estimateLinalgSoftmaxOpCost(op); - if (estimatedCost < bestEstimatedCost) - return; - bestEstimatedCost = estimatedCost; - bestOp = op; - LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() - << "', cost: " << bestEstimatedCost << "\n"); - }) - .Case([&](auto op) { - int64_t estimatedCost = estimateLinalgOpCost(op); - if (estimatedCost < bestEstimatedCost) - return; - bestEstimatedCost = estimatedCost; - bestOp = op; - LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() - << "', cost: " << bestEstimatedCost << "\n"); - }) - .Case( - [&](auto op) { - // SetEncoding/UnsetEncoding is the bestOp only if there are no - // other operations. - int64_t estimatedCost = kMinEstimatedCost + 1; - if (estimatedCost < bestEstimatedCost) - return; - bestEstimatedCost = estimatedCost; - bestOp = op; - LLVM_DEBUG(llvm::dbgs() - << "// new best op: '" << bestOp->getName() - << "', cost: " << bestEstimatedCost << "\n"); - }) - .Case([&](auto op) { - int64_t estimatedCost = estimateLinalgExtOpCost(op); - if (estimatedCost < bestEstimatedCost) - return; - bestEstimatedCost = estimatedCost; - bestOp = op; - LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() - << "', cost: " << bestEstimatedCost << "\n"); - }) - .Default([&](Operation *op) { - // No cost estimation implemented, skip. - }); - }); - - if (!bestOp) { - std::string bestSummary = ""; - // Check if there is a possible slow memory copy as a dispatch. The current - // heuristic is to check if a dispatch.tensor.store stores a tensor that is - // directly loaded from a dispatch.tensor.load. - regionOp.getWorkgroupBody().walk( - [&](IREE::Flow::DispatchTensorStoreOp storeOp) { - Value input = storeOp.getValue(); - if (auto loadOp = - input.getDefiningOp()) { - bestSummary = "slow_memcpy"; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - return bestSummary; - } - - std::string bestSummary = ""; - TypeSwitch(bestOp) - .Case( - [&](auto op) { bestSummary = summarizeLinalgExtOp(op); }) - .Case( - [&](auto op) { bestSummary = summarizeLinalgOp(op); }) - .Case([&](auto op) { - auto opName = getOpNameWithoutDialectName(op); - auto encoding = op.getResultType() - .getEncoding() - .template cast(); - auto user = stringifyEnum(encoding.getUser().getValue()); - auto role = stringifyEnum(encoding.getRole().getValue()); - ArrayRef shape = op.getSourceType().getShape(); - bestSummary = opName + "_" + user.str() + "_" + role.str() + "_" + - loopRangesToString(shape); - ; - }) - .Case([&](auto op) { - auto opName = getOpNameWithoutDialectName(op); - auto encoding = op.getSourceType() - .getEncoding() - .template cast(); - auto user = stringifyEnum(encoding.getUser().getValue()); - auto role = stringifyEnum(encoding.getRole().getValue()); - ArrayRef shape = op.getResultType().getShape(); - bestSummary = opName + "_" + user.str() + "_" + role.str() + "_" + - loopRangesToString(shape); - }) - .Case( - [&](auto op) { bestSummary = summarizeLinalgExtOp(op); }) - .Default([&](Operation *op) { - // No summarization implemented, default to the op's name. - bestSummary = op->getName().getStringRef().str(); - }); - - // Sanitize the string so that it contains only C literal-compatible chars. 
- bestSummary = sanitizeSymbolName(bestSummary); - - LLVM_DEBUG(llvm::dbgs() << "// best op summary: '" << bestSummary << "'\n"); - return bestSummary; -} - // Creates a flow.executable out of a set of functions, pulling in all other // functions reachable by the provided functions. static ExecutableOp createExecutable(Location loc, StringRef executableName, @@ -475,18 +183,8 @@ class OutlineDispatchRegionsPass for (int i = 0; i < dispatchWorkgroupsOps.size(); ++i) { std::string executableOpName = (namePrefix + "_dispatch_" + llvm::Twine(i)).str(); - // Add a summary of the op as a suffix, if one can be generated. - // Note: the executable names omit this suffix so their names are more - // predictable. - LLVM_DEBUG(llvm::dbgs() - << "//--- summarizing '" << executableOpName << "' ---//\n"); - std::string opSummary = - summarizeDispatchWorkgroupsOp(dispatchWorkgroupsOps[i]); - LLVM_DEBUG(llvm::dbgs() - << "//--- opSummary: '" << opSummary << "' ---//\n\n"); - std::string opSuffix = opSummary.empty() ? "" : "_" + opSummary; - std::string exportOpName = executableOpName + opSuffix; - if (failed(outlineDispatchWorkgroupsOp(executableOpName, exportOpName, + if (failed(outlineDispatchWorkgroupsOp(executableOpName, + executableOpName, dispatchWorkgroupsOps[i]))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp index beece70682d8..2500b4388bdb 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp @@ -199,6 +199,11 @@ void buildFlowTransformPassPipeline(OpPassManager &passManager, // wrapped in executables. passManager.addPass(IREE::Flow::createOutlineDispatchRegionsPass()); + // Annotate executables based on their contents. + // This is optional but can provide useful information during compilation and + // runtime profiling/tracing. + passManager.addPass(IREE::Flow::createAnnotateDispatchesPass()); + // Trace/break dispatches by ordinal in the specified region. There is a // similar version of the pass run both before and after deduplication // depending on if the target is specified by ordinal or by symbol. diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h index b8db1df54c13..bbfc494b7452 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h @@ -203,6 +203,9 @@ std::unique_ptr createCaptureDispatchDynamicDimsPass(); std::unique_ptr> createOutlineDispatchRegionsPass(); +// Annotates executable dispatches based on their contents. +std::unique_ptr> createAnnotateDispatchesPass(); + // Injects tracing markers for dispatch operation tensor inputs and outputs. 
std::unique_ptr> createInjectDispatchTracingPass(); diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td index 2aa8ef36f600..2591ff803c65 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td @@ -9,7 +9,14 @@ include "mlir/Pass/PassBase.td" -def CaptureDispatchDynamicDims : Pass<"iree-flow-capture-dispatch-dynamic-dims", ""> { +def AnnotateDispatches : + Pass<"iree-flow-annotate-dispatches", "mlir::ModuleOp"> { + let summary = "Annotates executable dispatches based on their contents."; + let constructor = "mlir::iree_compiler::IREE::Flow::createAnnotateDispatchesPass()"; +} + +def CaptureDispatchDynamicDims : + Pass<"iree-flow-capture-dispatch-dynamic-dims", ""> { let summary = "Captures dynamic shape dimensions required by dispatch operands/results."; let constructor = "mlir::iree_compiler::IREE::Flow::createCaptureDispatchDynamicDimsPass()"; } @@ -26,6 +33,12 @@ def CleanupNumericNarrowing : let constructor = "mlir::iree_compiler::IREE::Flow::createCleanupNumericNarrowingPass()"; } +def CloneProducersIntoDispatchRegions : + InterfacePass<"iree-flow-clone-producers-into-dispatch-regions", "mlir::FunctionOpInterface"> { + let summary = "Clone producers into dispatch regions to be isolated above"; + let constructor = "mlir::iree_compiler::IREE::Flow::createCloneProducersIntoDispatchRegionsPass()"; +} + def CollapseDims : Pass<"iree-flow-collapse-dims", ""> { let summary = "Collapse reduction dimensions when possible."; @@ -100,10 +113,10 @@ def FormScalarDispatches : let constructor = "mlir::iree_compiler::IREE::Flow::createFormScalarDispatchesPass()"; } -def CloneProducersIntoDispatchRegions : - InterfacePass<"iree-flow-clone-producers-into-dispatch-regions", "mlir::FunctionOpInterface"> { - let summary = "Clone producers into dispatch regions to be isolated above"; - let constructor = "mlir::iree_compiler::IREE::Flow::createCloneProducersIntoDispatchRegionsPass()"; +def FuseDequantizationMatmul: + Pass<"iree-flow-fuse-dequantization-matmul", ""> { + let summary = "Fuse dequantization and matmul linalg.generic ops"; + let constructor = "mlir::iree_compiler::IREE::Flow::createFuseDequantizationMatmulPass()"; } def CollapseDimensions : @@ -150,6 +163,23 @@ def DispatchWithTransformDialect : ]; } +def DumpDispatchGraph : Pass<"iree-flow-dump-dispatch-graph-pass"> { + let summary = "Print visualization of dispatches"; + let options = [ + Option<"maxLabelLen", "max-label-len", "unsigned", + /*default=*/"20", "Limit attribute/type length to number of chars">, + Option<"printAttrs", "print-attrs", "bool", + /*default=*/"true", "Print attributes of operations">, + Option<"printControlFlowEdges", "print-control-flow-edges", "bool", + /*default=*/"false", "Print control flow edges">, + Option<"printDataFlowEdges", "print-data-flow-edges", "bool", + /*default=*/"true", "Print data flow edges">, + Option<"printResultTypes", "print-result-types", "bool", + /*default=*/"true", "Print result types of operations"> + ]; + let constructor = "mlir::iree_compiler::IREE::Flow::createDumpDispatchGraphPass()"; +} + def EraseUnusedLinalgOperands : Pass<"iree-flow-erase-unused-linalg-operands", "mlir::ModuleOp"> { let summary = "Erase unused linalg operand and remove dead code."; @@ -192,7 +222,8 @@ def InferNumericNarrowing : let constructor = "mlir::iree_compiler::IREE::Flow::createInferNumericNarrowingPass()"; } -def InitializeEmptyTensors : 
Pass<"iree-flow-initialize-empty-tensors", ""> { +def InitializeEmptyTensors : + Pass<"iree-flow-initialize-empty-tensors", ""> { let summary = "Initialize empty tensors"; let options = [ Option<"zeroFill", "zero-fill", "bool", /*default=*/"false", @@ -259,40 +290,6 @@ def OutlineDispatchRegions : let constructor = "mlir::iree_compiler::IREE::Flow::createOutlineDispatchRegionsPass()"; } -def SetEncoding : Pass<"iree-flow-set-encoding", ""> { - let summary = "Introduce tensor encoding for compute operations"; - let constructor = "mlir::iree_compiler::IREE::Flow::createSetEncodingPass()"; -} - -def TensorPadToTensorInsertSlice : - Pass<"iree-flow-tensor-pad-to-tensor-insert-slice", ""> { - let summary = "Convert tensor.pad into linalg.fill + tensor.insert_slice"; - let constructor = "mlir::iree_compiler::IREE::Flow::createTensorPadToTensorInsertSlicePass()"; - let options = [ - Option<"optionSkipSingleLinalgOpUses", "skip-one-linalg-use-case", "bool", - /*default=*/"false", - "Skip the op that has only one use which is used" - "by a Linalg op">, - ]; -} - -def DumpDispatchGraph : Pass<"iree-flow-dump-dispatch-graph-pass"> { - let summary = "Print visualization of dispatches"; - let options = [ - Option<"maxLabelLen", "max-label-len", "unsigned", - /*default=*/"20", "Limit attribute/type length to number of chars">, - Option<"printAttrs", "print-attrs", "bool", - /*default=*/"true", "Print attributes of operations">, - Option<"printControlFlowEdges", "print-control-flow-edges", "bool", - /*default=*/"false", "Print control flow edges">, - Option<"printDataFlowEdges", "print-data-flow-edges", "bool", - /*default=*/"true", "Print data flow edges">, - Option<"printResultTypes", "print-result-types", "bool", - /*default=*/"true", "Print result types of operations"> - ]; - let constructor = "mlir::iree_compiler::IREE::Flow::createDumpDispatchGraphPass()"; -} - def RaiseSpecialOps : Pass<"iree-flow-raise-special-ops", ""> { let summary = "raise special ops like softmax to the high level linalg.ext representation"; @@ -305,6 +302,11 @@ def RemoveZeroExtentTensors : let constructor = "mlir::iree_compiler::IREE::Flow::createRemoveZeroExtentTensorsPass()"; } +def SetEncoding : Pass<"iree-flow-set-encoding", ""> { + let summary = "Introduce tensor encoding for compute operations"; + let constructor = "mlir::iree_compiler::IREE::Flow::createSetEncodingPass()"; +} + def SplitReduction : Pass<"iree-flow-split-reduction-ops", ""> { let summary = "Split reduction dimension to increase parallelism."; @@ -317,6 +319,18 @@ def StripSignedness : let constructor = "mlir::iree_compiler::IREE::Flow::createStripSignednessPass()"; } +def TensorPadToTensorInsertSlice : + Pass<"iree-flow-tensor-pad-to-tensor-insert-slice", ""> { + let summary = "Convert tensor.pad into linalg.fill + tensor.insert_slice"; + let constructor = "mlir::iree_compiler::IREE::Flow::createTensorPadToTensorInsertSlicePass()"; + let options = [ + Option<"optionSkipSingleLinalgOpUses", "skip-one-linalg-use-case", "bool", + /*default=*/"false", + "Skip the op that has only one use which is used" + "by a Linalg op">, + ]; +} + def TopLevelSCFToCFG : InterfacePass<"iree-top-level-scf-to-cfg", "mlir::FunctionOpInterface"> { let summary = "Converts non-nested SCF constructs to CFG (not traversing into opaque operations)."; @@ -328,10 +342,4 @@ def VerifyInputLegality: Pass<"iree-verify-input-legality", ""> { let constructor = "mlir::iree_compiler::IREE::Flow::createVerifyInputLegalityPass()"; } -def FuseDequantizationMatmul: - 
Pass<"iree-flow-fuse-dequantization-matmul", ""> { - let summary = "Fuse dequantization and matmul linalg.generic ops"; - let constructor = "mlir::iree_compiler::IREE::Flow::createFuseDequantizationMatmulPass()"; -} - #endif // IREE_DIALECT_FLOW_PASSES diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel index e506ec9fa09f..3ebe0a8e118f 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel @@ -16,6 +16,7 @@ iree_lit_test_suite( name = "lit", srcs = enforce_glob( [ + "annotate_dispatches.mlir", "capture_dispatch_dynamic_dims.mlir", "cleanup_numeric_narrowing.mlir", "cleanup_tensor_shapes.mlir", diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt index 9c7242033bfc..ed4b27ae97f1 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt @@ -14,6 +14,7 @@ iree_lit_test_suite( NAME lit SRCS + "annotate_dispatches.mlir" "capture_dispatch_dynamic_dims.mlir" "cleanup_numeric_narrowing.mlir" "cleanup_tensor_shapes.mlir" diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir new file mode 100644 index 000000000000..184f9d057019 --- /dev/null +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir @@ -0,0 +1,166 @@ +// RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-annotate-dispatches %s | FileCheck %s + +// Dispatches containing some ops get a heuristics-driven summary in their name. +// This also tests symbol reference renaming. 
+ +flow.executable private @ex0 { + // CHECK: flow.executable.export public @dispatch0_fill_4x8_f32 + flow.executable.export public @dispatch0 + builtin.module { + // CHECK: func.func @dispatch0_fill_4x8_f32 + func.func @dispatch0(%arg0: !flow.dispatch.tensor>) { + %cst = arith.constant 1.000000e+02 : f32 + %0 = tensor.empty() : tensor<4x8xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x8xf32>) -> tensor<4x8xf32> + flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0], sizes = [4, 8], strides = [1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor> + return + } + } +} +flow.executable private @ex1 { + // CHECK: flow.executable.export public @dispatch1_fill_8x4_f32 + flow.executable.export public @dispatch1 + builtin.module { + // CHECK: func.func @dispatch1_fill_8x4_f32 + func.func @dispatch1(%arg0: !flow.dispatch.tensor>) { + %cst = arith.constant 2.000000e+02 : f32 + %0 = tensor.empty() : tensor<8x4xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<8x4xf32>) -> tensor<8x4xf32> + flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0], sizes = [8, 4], strides = [1, 1] : tensor<8x4xf32> -> !flow.dispatch.tensor> + return + } + } +} +func.func @main() -> (tensor<4x8xf32>, tensor<8x4xf32>) { + %c100 = arith.constant 100 : index + %c50 = arith.constant 50 : index + // CHECK: flow.dispatch @ex0::@dispatch0_fill_4x8_f32 + %0 = flow.dispatch @ex0::@dispatch0[%c100, %c50]() : () -> tensor<4x8xf32> + // CHECK: flow.dispatch @ex1::@dispatch1_fill_8x4_f32 + %1 = flow.dispatch @ex1::@dispatch1[%c100, %c50]() : () -> tensor<8x4xf32> + return %0, %1 : tensor<4x8xf32>, tensor<8x4xf32> +} + +// ----- + +// A cost model picks the "most expensive" op to include in the summary. + +flow.executable private @ex { + // CHECK: flow.executable.export public @dispatch_fill_40_f32 + flow.executable.export public @dispatch + builtin.module { + func.func @dispatch(%arg0: !flow.dispatch.tensor>) { + %cst = arith.constant 1.000000e+02 : f32 + %0 = tensor.empty() : tensor<10xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<10xf32>) -> tensor<10xf32> + %2 = tensor.empty() : tensor<40xf32> + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<40xf32>) -> tensor<40xf32> + %4 = tensor.empty() : tensor<20xf32> + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<20xf32>) -> tensor<20xf32> + flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor> + return + } + } +} + +// ----- + +// Dynamic dimensions are considered the most expensive. + +flow.executable private @ex { + // CHECK: flow.executable.export public @dispatch_fill_DxDxD_f32 + flow.executable.export public @dispatch + builtin.module { + func.func @dispatch(%arg0: index, %arg1: !flow.dispatch.tensor>) { + %cst = arith.constant 1.000000e+02 : f32 + %0 = tensor.empty() : tensor<10xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<10xf32>) -> tensor<10xf32> + %2 = tensor.empty(%arg0, %arg0, %arg0) : tensor + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor) -> tensor + flow.dispatch.tensor.store %1, %arg1, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor> + return + } + } +} + +// ----- + +// Dispatch key op with multiple datatypes should be reflected in summary. 
+ +flow.executable private @ex { + // CHECK: flow.executable.export public @dispatch_generic_4x8_i32xf32 + flow.executable.export public @dispatch + builtin.module { + func.func @dispatch(%arg0: !flow.dispatch.tensor>) { + %0 = tensor.empty() : tensor<4x8xi32> + %1 = tensor.empty() : tensor<4x8xf32> + %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4x8xi32>) outs(%1 : tensor<4x8xf32>) { + ^bb0(%in: i32, %out: f32): + %3 = arith.index_cast %in : i32 to index + %extracted = tensor.extract %1[%3, %3] : tensor<4x8xf32> + linalg.yield %extracted : f32 + } -> tensor<4x8xf32> + flow.dispatch.tensor.store %2, %arg0, offsets = [0, 0], sizes = [4, 8], strides = [1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor> + return + } + } +} + +// ----- + +// Dispatches set_encoding and unset_encoding ops get a heuristics-driven +// summary in their name. + +flow.executable private @ex0 { + // CHECK: flow.executable.export public @dispatch0_map_DxD_f32 + flow.executable.export public @dispatch0 + builtin.module { + func.func @dispatch0(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor>>) { + %0 = flow.dispatch.workload.ordinal %arg2, 0 : index + %1 = flow.dispatch.workload.ordinal %arg3, 1 : index + %2 = flow.dispatch.workload.ordinal %arg4, 2 : index + %3 = flow.dispatch.workload.ordinal %arg5, 3 : index + %4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%0, %1} + %5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor>{%2, %3} + %6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor>>{%2, %3} + %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %mapped = linalg.map { math.absf } ins(%7 : tensor) outs(%8 : tensor) + %9 = iree_linalg_ext.set_encoding %mapped : tensor -> tensor> + flow.dispatch.tensor.store %9, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor> -> !flow.dispatch.tensor>>{%arg4, %arg5} + return + } + } +} +flow.executable private @ex1 { + // CHECK: flow.executable.export public @dispatch1_unset_encoding_MATMUL_F32F32F32_LHS_DxD + flow.executable.export public @dispatch1 + builtin.module { + func.func @dispatch1(%arg0: !flow.dispatch.tensor>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor>) { + %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>>{%arg1, %arg2} + %1 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor>{%arg1, %arg2} + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%arg1, %arg2], strides = [1, 1] : !flow.dispatch.tensor>>{%arg1, %arg2} -> tensor> + %3 = iree_linalg_ext.unset_encoding %2 : tensor> -> tensor + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [%arg1, %arg2], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%arg1, %arg2} + return + } + } +} + +// ----- + +// Named root linalg ops get represented in the dispatch name. 
+ +flow.executable private @ex { + // CHECK: flow.executable.export public @dispatch_softmax_7xf32 + flow.executable.export public @dispatch + builtin.module { + func.func @dispatch(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>) { + %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor> -> tensor<7xf32> + %1 = tensor.empty() : tensor<7xf32> + %2 = linalg.softmax dimension(0) ins(%0 : tensor<7xf32>) outs(%1 : tensor<7xf32>) -> tensor<7xf32> + flow.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor> + return + } + } +} diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir index e9bb837b9d88..446268544e81 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir @@ -176,175 +176,3 @@ func.func @dispatchWithCountRegion(%arg0: tensor<4xi32>) -> tensor<4xi32> { } return %0 : tensor<4xi32> } - -// ----- - -// Dispatches containing some ops get a heuristics-driven summary in their name. - -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_fill_4x8 -// CHECK: func.func @main_dispatch_0_fill_4x8_f32( -func.func @main() -> tensor<4x8xf32> { - %x = arith.constant 100 : index - %y = arith.constant 50 : index - %0 = flow.dispatch.workgroups[%x, %y]() : () -> (tensor<4x8xf32>) = ( - %ret: !flow.dispatch.tensor> - ) { - %cst = arith.constant 100.0 : f32 - %init = tensor.empty() : tensor<4x8xf32> - %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<4x8xf32>) -> tensor<4x8xf32> - flow.dispatch.tensor.store %fill, %ret, offsets = [0, 0], sizes = [4, 8], strides = [1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor> - flow.return - } - return %0 : tensor<4x8xf32> -} - -// ----- - -// A cost model picks the "most expensive" op to include in the summary. - -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_fill_40 -// CHECK: func.func @main_dispatch_0_fill_40_f32( -func.func @main() -> tensor<10xf32> { - %x = arith.constant 100 : index - %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<10xf32>) = ( - %ret: !flow.dispatch.tensor> - ) { - %cst = arith.constant 100.0 : f32 - %init_small = tensor.empty() : tensor<10xf32> - %fill_small = linalg.fill ins(%cst : f32) outs(%init_small : tensor<10xf32>) -> tensor<10xf32> - // Note the ordering here - test that we don't just pick the first or the - // last op. If an op in the middle has a higher cost then it should be used. - %init_large = tensor.empty() : tensor<40xf32> - %fill_large = linalg.fill ins(%cst : f32) outs(%init_large : tensor<40xf32>) -> tensor<40xf32> - %init_medium = tensor.empty() : tensor<20xf32> - %fill_medium = linalg.fill ins(%cst : f32) outs(%init_medium : tensor<20xf32>) -> tensor<20xf32> - flow.dispatch.tensor.store %fill_small, %ret, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor> - flow.return - } - return %0 : tensor<10xf32> -} - -// ----- - -// Dynamic dimensions are considered the most expensive. 
- -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_fill_DxDxD -// CHECK: func.func @main_dispatch_0_fill_DxDxD_f32( -func.func @main(%arg0 : index) -> tensor<10xf32> { - %x = arith.constant 100 : index - %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<10xf32>) = ( - %arg0: index, - %ret: !flow.dispatch.tensor> - ) { - %cst = arith.constant 100.0 : f32 - %init_small = tensor.empty() : tensor<10xf32> - %fill_small = linalg.fill ins(%cst : f32) outs(%init_small : tensor<10xf32>) -> tensor<10xf32> - %init_dynamic = tensor.empty(%arg0, %arg0, %arg0) : tensor - %fill_dynamic = linalg.fill ins(%cst : f32) outs(%init_dynamic : tensor) -> tensor - flow.dispatch.tensor.store %fill_small, %ret, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor> - flow.return - } - return %0 : tensor<10xf32> -} - -// ----- - -// Dispatch key op with multiple datatypes should be reflected in summary. - -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_generic_4x8_i32xf32 -// CHECK: func.func @main_dispatch_0_generic_4x8_i32xf32( -func.func @main() -> tensor<4x8xf32> { - %x = arith.constant 100 : index - %y = arith.constant 50 : index - %0 = flow.dispatch.workgroups[%x, %y]() : () -> (tensor<4x8xf32>) = ( - %ret: !flow.dispatch.tensor> - ) { - %a = tensor.empty() : tensor<4x8xi32> - %b = tensor.empty() : tensor<4x8xf32> - %ans = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%a : tensor<4x8xi32>) outs(%b : tensor<4x8xf32>) { - ^bb0(%b0 : i32, %b1 : f32): - %1 = arith.index_cast %b0 : i32 to index - %2 = tensor.extract %b[%1, %1] : tensor<4x8xf32> - linalg.yield %2 : f32 - } -> tensor<4x8xf32> - flow.dispatch.tensor.store %ans, %ret, offsets = [0, 0], sizes = [4, 8], strides = [1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor> - flow.return - } - return %0 : tensor<4x8xf32> -} - -// ----- - -// Dispatches set_encoding and unset_encoding ops get a heuristics-driven -// summary in their name. 
- -// CHECK: flow.executable private @main_dispatch_0 -// CHECK: func.func @main_dispatch_0_map_DxD -// CHECK: flow.executable private @main_dispatch_1 -// CHECK: func.func @main_dispatch_1_unset_encoding_MATMUL_F32F32F32_LHS_DxD -func.func @main(%arg0: tensor, %arg1: index, %arg2: index, %arg3: tensor, %arg4: index, %arg5: index) -> (tensor, index, index) { - %0 = flow.tensor.tie_shape %arg0 : tensor{%arg1, %arg2} - %1 = flow.tensor.tie_shape %arg3 : tensor{%arg4, %arg5} - %2 = flow.dispatch.workgroups[%arg1, %arg2, %arg4, %arg5](%0, %1, %arg1, %arg2, %arg4, %arg5) : (tensor{%arg1, %arg2}, tensor{%arg4, %arg5}, index, index, index, index) -> tensor>{%arg4, %arg5} = - (%arg6: !flow.dispatch.tensor>, %arg7: !flow.dispatch.tensor>, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: !flow.dispatch.tensor>>) { - %arg8_0 = flow.dispatch.workload.ordinal %arg8, 0 : index - %arg9_0 = flow.dispatch.workload.ordinal %arg9, 1 : index - %arg10_0 = flow.dispatch.workload.ordinal %arg10, 2 : index - %arg11_0 = flow.dispatch.workload.ordinal %arg11, 3 : index - %4 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor>{%arg8_0, %arg9_0} - %5 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor>{%arg10_0, %arg11_0} - %6 = flow.dispatch.tie_shape %arg12 : !flow.dispatch.tensor>>{%arg10_0, %arg11_0} - %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%arg8_0, %arg9_0], strides = [1, 1] : !flow.dispatch.tensor>{%arg8_0, %arg9_0} -> tensor - %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%arg10_0, %arg11_0], strides = [1, 1] : !flow.dispatch.tensor>{%arg10_0, %arg11_0} -> tensor - %mapped = linalg.map { math.absf } ins(%7 : tensor) outs(%8 : tensor) - %9 = iree_linalg_ext.set_encoding %mapped : tensor -> tensor> - flow.dispatch.tensor.store %9, %6, offsets = [0, 0], sizes = [%arg10_0, %arg11_0], strides = [1, 1] : tensor> -> !flow.dispatch.tensor>>{%arg10, %arg11} - flow.return - } count(%arg6: index, %arg7: index, %arg8: index, %arg9: index) -> (index, index, index) { - %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg6, %arg7, %arg8, %arg9 - flow.return %x, %y, %z : index, index, index - } - %3 = flow.dispatch.workgroups[%arg4, %arg5](%2, %arg4, %arg5) : (tensor>{%arg4, %arg5}, index, index) -> tensor{%arg4, %arg5} = - (%arg6: !flow.dispatch.tensor>>, %arg7: index, %arg8: index, %arg9: !flow.dispatch.tensor>) { - %4 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor>>{%arg7, %arg8} - %5 = flow.dispatch.tie_shape %arg9 : !flow.dispatch.tensor>{%arg7, %arg8} - %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : !flow.dispatch.tensor>>{%arg7, %arg8} -> tensor> - %7 = iree_linalg_ext.unset_encoding %6 : tensor> -> tensor - flow.dispatch.tensor.store %7, %5, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%arg7, %arg8} - flow.return - } count(%arg6: index, %arg7: index) -> (index, index, index) { - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg6, %arg7 - flow.return %x, %y, %z : index, index, index - } - return %3, %arg1, %arg2 : tensor, index, index -} - -// ----- - -// iree_linalg_ext ops get a heuristics-driven summary in their name. 
- -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_softmax_7xf32 -// CHECK: func.func @main_dispatch_0_softmax_7xf32( -func.func @main(%arg0: tensor<7xf32>) -> tensor<7xf32> { - %c7 = arith.constant 7 : index - %0 = flow.dispatch.workgroups[%c7](%arg0) : (tensor<7xf32>) -> tensor<7xf32> = - (%arg1: !flow.dispatch.tensor>, %arg2: !flow.dispatch.tensor>) { - %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor> -> tensor<7xf32> - %2 = tensor.empty() : tensor<7xf32> - %3 = linalg.softmax dimension(0) ins(%1 : tensor<7xf32>) outs(%2 : tensor<7xf32>) -> tensor<7xf32> - flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor> - flow.return - } count(%arg1: index) -> (index, index, index) { - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1 - flow.return %x, %y, %z : index, index, index - } - return %0 : tensor<7xf32> -}
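
Illustrative example (hypothetical, not part of the patch): the annotation derives each export's suffix from the "most expensive" op's name, its static loop ranges, and its operand element types. For a dispatch whose heaviest op is a static 128x64x256 f32 matmul, the cost model prefers the matmul over the fill, so the export and function would be renamed to something like @dispatch_matmul_128x64x256_f32. The symbols @example and @dispatch below are made up for illustration only.

flow.executable private @example {
  // Expected rename after --iree-flow-annotate-dispatches:
  //   @dispatch_matmul_128x64x256_f32
  flow.executable.export public @dispatch
  builtin.module {
    func.func @dispatch(%lhs: !flow.dispatch.tensor<readonly:tensor<128x256xf32>>, %rhs: !flow.dispatch.tensor<readonly:tensor<256x64xf32>>, %out: !flow.dispatch.tensor<writeonly:tensor<128x64xf32>>) {
      %cst = arith.constant 0.0 : f32
      %0 = flow.dispatch.tensor.load %lhs, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
      %1 = flow.dispatch.tensor.load %rhs, offsets = [0, 0], sizes = [256, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x64xf32>> -> tensor<256x64xf32>
      // The fill (128x64 iterations) costs less than the matmul (128x64x256
      // iterations), so the matmul is chosen for the summary.
      %2 = tensor.empty() : tensor<128x64xf32>
      %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<128x64xf32>) -> tensor<128x64xf32>
      %4 = linalg.matmul ins(%0, %1 : tensor<128x256xf32>, tensor<256x64xf32>) outs(%3 : tensor<128x64xf32>) -> tensor<128x64xf32>
      flow.dispatch.tensor.store %4, %out, offsets = [0, 0], sizes = [128, 64], strides = [1, 1] : tensor<128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x64xf32>>
      return
    }
  }
}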