diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index a5925f4aeaae..cd2876eb25e8 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -286,8 +286,6 @@ static void printDispatchWorkgroupsCountRegion(OpAsmPrinter &p, Operation *op,
 // flow.dispatch.region
 //===----------------------------------------------------------------------===//
-// Verifies the workgroup count
-
 static LogicalResult verifyWorkgroupCountRegion(Operation *op,
                                                 ValueRange workload,
                                                 Region &region) {
   // Verify the workload operands match the expected capture args.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
new file mode 100644
index 000000000000..1d5d4e61e6b3
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
@@ -0,0 +1,378 @@
+// Copyright 2019 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <utility>
+
+#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
+#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
+#include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
+#include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "iree/compiler/Utils/StringUtils.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "iree-dispatch"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace Flow {
+namespace {
+
+static int64_t costOfDomain(ArrayRef<int64_t> domain) {
+  int64_t product = 1;
+  for (int64_t size : domain) {
+    if (size == mlir::ShapedType::kDynamic)
+      return INT64_MAX;
+    product *= size;
+  }
+  return product;
+};
+
+// Estimates the evaluation cost of a linalg op using a heuristic cost model.
+static int64_t estimateLinalgOpCost(linalg::LinalgOp op) {
+  // For linalg ops we know the iteration domain, so return the number
+  // of iterations of the iteration domain (or INT64_MAX for dynamic.)
+  int64_t cost = costOfDomain(op.getStaticLoopRanges());
+  LLVM_DEBUG(llvm::dbgs() << "// " << op->getName() << " cost: " << cost
+                          << "\n");
+  return cost;
+}
+
+static TensorType getMainTensorForLinalgExtOp(Operation *op) {
+  TensorType main;
+  auto operandTypes = llvm::to_vector(op->getOperandTypes());
+  auto resultTypes = llvm::to_vector(op->getResultTypes());
+  for (Type t : llvm::concat<Type>(operandTypes, resultTypes)) {
+    auto tensorType = llvm::dyn_cast<TensorType>(t);
+    if (!tensorType)
+      continue;
+    if (!main) {
+      main = tensorType;
+    } else if (costOfDomain(tensorType.getShape()) >
+               costOfDomain(main.getShape())) {
+      main = tensorType;
+    }
+  }
+  return main;
+}
+
+// Estimates the evaluation cost of a LinalgExt op using a heuristic cost
+// model.
+static int64_t estimateLinalgExtOpCost(Operation *op) {
+  TensorType mainTensor = getMainTensorForLinalgExtOp(op);
+  // Use the cost of the biggest tensor of the LinalgExt op as an
+  // approximation. This is a very, very coarse approximation.
+  auto cost = mainTensor ? costOfDomain(mainTensor.getShape()) : 1;
+  // Multiply by a semi-arbitrarily chosen factor to capture that LinalgExt ops
+  // are "somewhat more expensive" than simply traversing the main tensor.
+  // This is something like the extra log(N) factor for a sort or FFT, or
+  // the amount of work done by a softmax vs a cheap elementwise on a tensor
+  // of the same shape.
+  cost *= 10;
+  LLVM_DEBUG(llvm::dbgs() << "// " << op->getName() << " cost: " << cost
+                          << "\n");
+  return cost;
+}
+
+// Estimates the evaluation cost of a Linalg::Softmax op using a heuristic cost
+// model similar to LinalgExt ops.
+static int64_t estimateLinalgSoftmaxOpCost(Operation *op) {
+  return estimateLinalgExtOpCost(op);
+}
+
+// Returns a string like "512xDx128" representing loop ranges.
+static std::string loopRangesToString(ArrayRef<int64_t> loopRanges) {
+  std::string outputString;
+  llvm::raw_string_ostream sstream(outputString);
+  llvm::interleave(
+      loopRanges,
+      [&](int64_t loopRange) {
+        // Note: normally we'd use '?', but that isn't a valid character for
+        // function names on a variety of targets, so we stick to [a-Z0-9_]
+        // characters.
+        sstream << (ShapedType::isDynamic(loopRange) ? "D"
+                                                     : llvm::itostr(loopRange));
+      },
+      [&] { sstream << "x"; });
+  return outputString;
+}
+
+static std::string operandTypeToString(Value operandValue) {
+  auto operandType = operandValue.getType();
+  std::string outputString;
+  llvm::raw_string_ostream sstream(outputString);
+  if (auto shapedType = dyn_cast<ShapedType>(operandType)) {
+    shapedType.getElementType().print(sstream);
+  } else {
+    operandType.print(sstream);
+  }
+  return outputString;
+}
+
+// Returns a string like "f32xi32xf16" representing a linalg op's types for each
+// operand. Will collapse to single type if all match.
+static std::string getLinalgDataTypes(linalg::LinalgOp op) {
+  std::string firstToken = "";
+  bool allTokensSame = true;
+  SmallVector<std::string> datatypeTokens;
+
+  for (Value operandValue : op->getOperands()) {
+    datatypeTokens.push_back(operandTypeToString(operandValue));
+    if (firstToken.empty()) {
+      firstToken = operandTypeToString(operandValue);
+    } else if (allTokensSame) {
+      allTokensSame = firstToken == operandTypeToString(operandValue);
+    }
+  }
+
+  if (allTokensSame) {
+    return firstToken;
+  } else {
+    std::string outputString;
+    llvm::raw_string_ostream sstream(outputString);
+    llvm::interleave(
+        datatypeTokens, [&](std::string token) { sstream << token; },
+        [&] { sstream << "x"; });
+    return outputString;
+  }
+}
+
+/// Returns the op name without dialect name. E.g., it returns "set_encoding" if
+/// the input operation is iree_linalg_ext.set_encoding.
+static std::string getOpNameWithoutDialectName(Operation *op) {
+  auto opName =
+      op->getName().getStringRef().drop_until([](char c) { return c == '.'; });
+  if (opName.starts_with("."))
+    opName = opName.drop_front();
+  return opName.str();
+}
+
+static std::string summarizeLinalgOp(linalg::LinalgOp op) {
+  auto opName = op->getName().getStringRef();
+  if (!opName.consume_front("linalg."))
+    return "";
+  std::string opLoopRanges = loopRangesToString(op.getStaticLoopRanges());
+  std::string opTypes = opLoopRanges.empty() ? "" : getLinalgDataTypes(op);
+  return opName.str() + (opLoopRanges.empty() ? "" : "_" + opLoopRanges) +
"" : "_" + opLoopRanges) + + (opTypes.empty() ? "" : "_" + opTypes); +} + +static std::string summarizeLinalgExtOp(Operation *op) { + auto opName = op->getName().getStringRef(); + // Currently, this utility is also invoked by Linalg::SoftmaxOp. + if (!(opName.consume_front("iree_linalg_ext.") || + opName.consume_front("linalg."))) + return ""; + std::string suffix = ""; + if (TensorType mainTensor = getMainTensorForLinalgExtOp(op)) { + llvm::raw_string_ostream sstream(suffix); + sstream << "_"; + sstream << loopRangesToString(mainTensor.getShape()); + sstream << "x"; + mainTensor.getElementType().print(sstream); + sstream.flush(); + } + return opName.str() + suffix; +} + +// Summarizes the contents of a dispatch into a short string. +// This uses heuristics to aid developer debugging. +static std::string summarizeDispatchRegion(Region ®ion) { + // The goal here is to build a relatively concise description that gives + // enough information to developers to see roughly what sort of computation a + // dispatch region performs. Multiple approaches are valid here, depending on + // what a developer wants to highlight. + // + // Currently, this uses a cost model to estimate which individual operation + // is the most computationally expensive, then a summary is generated which + // includes some of that operation's parameters. + // + // Other metrics to determine which single op is the "best" or which list of + // ops is most interesting (e.g. to highlight large data movements) could be + // used instead. + + Operation *bestOp = NULL; + const int64_t kMinEstimatedCost = -1; + int64_t bestEstimatedCost = kMinEstimatedCost; + region.walk([&](Operation *op) { + TypeSwitch(op) + .Case([&](auto op) { + int64_t estimatedCost = estimateLinalgSoftmaxOpCost(op); + if (estimatedCost < bestEstimatedCost) + return; + bestEstimatedCost = estimatedCost; + bestOp = op; + LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() + << "', cost: " << bestEstimatedCost << "\n"); + }) + .Case([&](auto op) { + int64_t estimatedCost = estimateLinalgOpCost(op); + if (estimatedCost < bestEstimatedCost) + return; + bestEstimatedCost = estimatedCost; + bestOp = op; + LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() + << "', cost: " << bestEstimatedCost << "\n"); + }) + .Case( + [&](auto op) { + // SetEncoding/UnsetEncoding is the bestOp only if there are no + // other operations. + int64_t estimatedCost = kMinEstimatedCost + 1; + if (estimatedCost < bestEstimatedCost) + return; + bestEstimatedCost = estimatedCost; + bestOp = op; + LLVM_DEBUG(llvm::dbgs() + << "// new best op: '" << bestOp->getName() + << "', cost: " << bestEstimatedCost << "\n"); + }) + .Case([&](auto op) { + int64_t estimatedCost = estimateLinalgExtOpCost(op); + if (estimatedCost < bestEstimatedCost) + return; + bestEstimatedCost = estimatedCost; + bestOp = op; + LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() + << "', cost: " << bestEstimatedCost << "\n"); + }) + .Default([&](Operation *op) { + // No cost estimation implemented, skip. + }); + }); + + if (!bestOp) { + std::string bestSummary = ""; + // Check if there is a possible slow memory copy as a dispatch. The current + // heuristic is to check if a dispatch.tensor.store stores a tensor that is + // directly loaded from a dispatch.tensor.load. 
+    region.walk([&](IREE::Flow::DispatchTensorStoreOp storeOp) {
+      Value input = storeOp.getValue();
+      if (auto loadOp =
+              input.getDefiningOp<IREE::Flow::DispatchTensorLoadOp>()) {
+        bestSummary = "slow_memcpy";
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+    return bestSummary;
+  }
+
+  std::string bestSummary = "";
+  TypeSwitch<Operation *>(bestOp)
+      .Case<linalg::SoftmaxOp>(
+          [&](auto op) { bestSummary = summarizeLinalgExtOp(op); })
+      .Case<linalg::LinalgOp>(
+          [&](auto op) { bestSummary = summarizeLinalgOp(op); })
+      .Case<IREE::LinalgExt::SetEncodingOp>([&](auto op) {
+        auto opName = getOpNameWithoutDialectName(op);
+        auto encoding = op.getResultType()
+                            .getEncoding()
+                            .template cast<IREE::LinalgExt::EncodingAttr>();
+        auto user = stringifyEnum(encoding.getUser().getValue());
+        auto role = stringifyEnum(encoding.getRole().getValue());
+        ArrayRef<int64_t> shape = op.getSourceType().getShape();
+        bestSummary = opName + "_" + user.str() + "_" + role.str() + "_" +
+                      loopRangesToString(shape);
+        ;
+      })
+      .Case<IREE::LinalgExt::UnsetEncodingOp>([&](auto op) {
+        auto opName = getOpNameWithoutDialectName(op);
+        auto encoding = op.getSourceType()
+                            .getEncoding()
+                            .template cast<IREE::LinalgExt::EncodingAttr>();
+        auto user = stringifyEnum(encoding.getUser().getValue());
+        auto role = stringifyEnum(encoding.getRole().getValue());
+        ArrayRef<int64_t> shape = op.getResultType().getShape();
+        bestSummary = opName + "_" + user.str() + "_" + role.str() + "_" +
+                      loopRangesToString(shape);
+      })
+      .Case<IREE::LinalgExt::LinalgExtOp>(
+          [&](auto op) { bestSummary = summarizeLinalgExtOp(op); })
+      .Default([&](Operation *op) {
+        // No summarization implemented, default to the op's name.
+        bestSummary = op->getName().getStringRef().str();
+      });
+
+  // Sanitize the string so that it contains only C literal-compatible chars.
+  bestSummary = sanitizeSymbolName(bestSummary);
+
+  LLVM_DEBUG(llvm::dbgs() << "// best op summary: '" << bestSummary << "'\n");
+  return bestSummary;
+}
+
+} // namespace
+
+class AnnotateDispatchesPass
+    : public AnnotateDispatchesBase<AnnotateDispatchesPass> {
+public:
+  AnnotateDispatchesPass() = default;
+
+  void runOnOperation() override {
+    DenseMap<Attribute, SymbolRefAttr> entryPointRefReplacements;
+    for (auto executableOp :
+         getOperation().getBody()->getOps<IREE::Flow::ExecutableOp>()) {
+      // Rename each export op.
+      for (auto exportOp :
+           executableOp.getBlock().getOps<IREE::Flow::ExecutableExportOp>()) {
+        auto oldSymbolRefAttr = SymbolRefAttr::get(
+            &getContext(), executableOp.getName(),
+            {SymbolRefAttr::get(&getContext(), exportOp.getSymName())});
+
+        auto funcOp =
+            executableOp.getInnerModule().lookupSymbol<FunctionOpInterface>(
+                exportOp.getFunctionRef());
+        if (!funcOp)
+          continue; // extern module, maybe
+        std::string summary = summarizeDispatchRegion(funcOp.getFunctionBody());
+        if (summary.empty())
+          continue; // unable to tell
+
+        std::string newName = funcOp.getName().str() + "_" + summary;
+
+        exportOp.setSymName(newName);
+        exportOp.setFunctionRef(newName);
+        funcOp.setName(newName);
+
+        auto newSymbolRefAttr =
+            SymbolRefAttr::get(&getContext(), executableOp.getName(),
+                               {SymbolRefAttr::get(&getContext(), newName)});
+        entryPointRefReplacements[oldSymbolRefAttr] = newSymbolRefAttr;
+      }
+    }
+
+    // Replace each usage of an entry point with its original symbol name with a
+    // new symbol name.
+    for (auto funcLikeOp : getOperation().getOps<FunctionOpInterface>()) {
+      funcLikeOp->walk([&](IREE::Flow::DispatchOp dispatchOp) {
+        auto it = entryPointRefReplacements.find(dispatchOp.getEntryPoint());
+        if (it != entryPointRefReplacements.end()) {
+          dispatchOp.setEntryPointAttr(llvm::cast<SymbolRefAttr>(it->second));
+        }
+      });
+    }
+  }
+};
+
+std::unique_ptr<OperationPass<mlir::ModuleOp>> createAnnotateDispatchesPass() {
+  return std::make_unique<AnnotateDispatchesPass>();
+}
+
+} // namespace Flow
+} // namespace IREE
+} // namespace iree_compiler
+} // namespace mlir
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel
index a3ad95f80049..1fbf965b3c40 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel
@@ -30,6 +30,7 @@ iree_gentbl_cc_library(
 iree_compiler_cc_library(
     name = "Transforms",
     srcs = [
+        "AnnotateDispatches.cpp",
         "CaptureDispatchDynamicDims.cpp",
         "CleanupNumericNarrowing.cpp",
         "CleanupTensorShapes.cpp",
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
index c3b86ccf1eef..3d931217c0fa 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
@@ -29,6 +29,7 @@ iree_cc_library(
     "Passes.h.inc"
     "RegionOpUtils.h"
   SRCS
+    "AnnotateDispatches.cpp"
    "CaptureDispatchDynamicDims.cpp"
    "CleanupNumericNarrowing.cpp"
    "CleanupTensorShapes.cpp"
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
index c472fd9ccf69..f90df6aa4884 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
@@ -6,17 +6,12 @@
 
 #include <utility>
 
-#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
 #include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
-#include "iree/compiler/Utils/StringUtils.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
@@ -24,299 +19,12 @@
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/Pass/Pass.h"
 
-#define DEBUG_TYPE "iree-dispatch"
-
 namespace mlir {
 namespace iree_compiler {
 namespace IREE {
 namespace Flow {
 namespace {
 
-static int64_t costOfDomain(ArrayRef<int64_t> domain) {
-  int64_t product = 1;
-  for (int64_t size : domain) {
-    if (size == mlir::ShapedType::kDynamic)
-      return INT64_MAX;
-    product *= size;
-  }
-  return product;
-};
-
-// Estimates the evaluation cost of a linalg op using a heuristic cost model.
-static int64_t estimateLinalgOpCost(linalg::LinalgOp op) {
-  // For linalg ops we know the iteration domain, so return the number
-  // of iterations of the iteration domain (or INT64_MAX for dynamic.)
- int64_t cost = costOfDomain(op.getStaticLoopRanges()); - LLVM_DEBUG(llvm::dbgs() << "// " << op->getName() << " cost: " << cost - << "\n"); - return cost; -} - -static TensorType getMainTensorForLinalgExtOp(Operation *op) { - TensorType main; - auto operandTypes = llvm::to_vector(op->getOperandTypes()); - auto resultTypes = llvm::to_vector(op->getResultTypes()); - for (Type t : llvm::concat(operandTypes, resultTypes)) { - auto tensorType = llvm::dyn_cast(t); - if (!tensorType) - continue; - if (!main) { - main = tensorType; - } else if (costOfDomain(tensorType.getShape()) > - costOfDomain(main.getShape())) { - main = tensorType; - } - } - return main; -} - -// Estimates the evaluation cost of a LinalgExt op using a heuristic cost -// model. -static int64_t estimateLinalgExtOpCost(Operation *op) { - TensorType mainTensor = getMainTensorForLinalgExtOp(op); - // Use the cost of the biggest tensor of the LinalgExt op as an approximation. - // This is a very, very coarse approximation. - auto cost = mainTensor ? costOfDomain(mainTensor.getShape()) : 1; - // Multiply by a semi-arbitrarily chosen factor to capture that LinalgExt ops - // are "somewhat more expensive" than simply traversing the main tensor. - // This is something like the extra log(N) factor for a sort or FFT, or - // the amount of work done by a softmax vs a cheap elementwise on a tensor - // of the same shape. - cost *= 10; - LLVM_DEBUG(llvm::dbgs() << "// " << op->getName() << " cost: " << cost - << "\n"); - return cost; -} - -// Estimates the evaluation cost of a Linalg::Softmax op using a heuristic cost -// model similar to LinalgExt ops. -static int64_t estimateLinalgSoftmaxOpCost(Operation *op) { - return estimateLinalgExtOpCost(op); -} - -// Returns a string like "512xDx128" representing loop ranges. -static std::string loopRangesToString(ArrayRef loopRanges) { - std::string outputString; - llvm::raw_string_ostream sstream(outputString); - llvm::interleave( - loopRanges, - [&](int64_t loopRange) { - // Note: normally we'd use '?', but that isn't a valid character for - // function names on a variety of targets, so we stick to [a-Z0-9_] - // characters. - sstream << (ShapedType::isDynamic(loopRange) ? "D" - : llvm::itostr(loopRange)); - }, - [&] { sstream << "x"; }); - return outputString; -} - -static std::string operandTypeToString(Value operandValue) { - auto operandType = operandValue.getType(); - std::string outputString; - llvm::raw_string_ostream sstream(outputString); - if (auto shapedType = dyn_cast(operandType)) { - shapedType.getElementType().print(sstream); - } else { - operandType.print(sstream); - } - return outputString; -} - -// Returns a string like "f32xi32xf16" representing a linalg op's types for each -// operands. Will collapse to single type if all match. 
-static std::string getLinalgDataTypes(linalg::LinalgOp op) { - std::string firstToken = ""; - bool allTokensSame = true; - SmallVector datatypeTokens; - - for (Value operandValue : op->getOperands()) { - datatypeTokens.push_back(operandTypeToString(operandValue)); - if (firstToken.empty()) { - firstToken = operandTypeToString(operandValue); - } else if (allTokensSame) { - allTokensSame = firstToken == operandTypeToString(operandValue); - } - } - - if (allTokensSame) { - return firstToken; - } else { - std::string outputString; - llvm::raw_string_ostream sstream(outputString); - llvm::interleave( - datatypeTokens, [&](std::string token) { sstream << token; }, - [&] { sstream << "x"; }); - return outputString; - } -} - -/// Returns the op name without dialect name. E.g., it returns "set_encoding" if -/// the input operation is iree_linalg_ext.set_encoding. -static std::string getOpNameWithoutDialectName(Operation *op) { - auto opName = - op->getName().getStringRef().drop_until([](char c) { return c == '.'; }); - if (opName.starts_with(".")) - opName = opName.drop_front(); - return opName.str(); -} - -static std::string summarizeLinalgOp(linalg::LinalgOp op) { - auto opName = op->getName().getStringRef(); - if (!opName.consume_front("linalg.")) - return ""; - std::string opLoopRanges = loopRangesToString(op.getStaticLoopRanges()); - std::string opTypes = opLoopRanges.empty() ? "" : getLinalgDataTypes(op); - return opName.str() + (opLoopRanges.empty() ? "" : "_" + opLoopRanges) + - (opTypes.empty() ? "" : "_" + opTypes); -} - -static std::string summarizeLinalgExtOp(Operation *op) { - auto opName = op->getName().getStringRef(); - // Currently, this utility is also invoked by Linalg::SoftmaxOp. - if (!(opName.consume_front("iree_linalg_ext.") || - opName.consume_front("linalg."))) - return ""; - std::string suffix = ""; - if (TensorType mainTensor = getMainTensorForLinalgExtOp(op)) { - llvm::raw_string_ostream sstream(suffix); - sstream << "_"; - sstream << loopRangesToString(mainTensor.getShape()); - sstream << "x"; - mainTensor.getElementType().print(sstream); - sstream.flush(); - } - return opName.str() + suffix; -} - -// Summarizes the contents of a dispatch into a short string. -// This uses heuristics to aid developer debugging. -static std::string -summarizeDispatchWorkgroupsOp(DispatchWorkgroupsOp regionOp) { - // The goal here is to build a relatively concise description that gives - // enough information to developers to see roughly what sort of computation a - // dispatch region performs. Multiple approaches are valid here, depending on - // what a developer wants to highlight. - // - // Currently, this uses a cost model to estimate which individual operation - // is the most computationally expensive, then a summary is generated which - // includes some of that operation's parameters. - // - // Other metrics to determine which single op is the "best" or which list of - // ops is most interesting (e.g. to highlight large data movements) could be - // used instead. 
- - Operation *bestOp = NULL; - const int64_t kMinEstimatedCost = -1; - int64_t bestEstimatedCost = kMinEstimatedCost; - regionOp.getWorkgroupBody().walk([&](Operation *op) { - TypeSwitch(op) - .Case([&](auto op) { - int64_t estimatedCost = estimateLinalgSoftmaxOpCost(op); - if (estimatedCost < bestEstimatedCost) - return; - bestEstimatedCost = estimatedCost; - bestOp = op; - LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() - << "', cost: " << bestEstimatedCost << "\n"); - }) - .Case([&](auto op) { - int64_t estimatedCost = estimateLinalgOpCost(op); - if (estimatedCost < bestEstimatedCost) - return; - bestEstimatedCost = estimatedCost; - bestOp = op; - LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() - << "', cost: " << bestEstimatedCost << "\n"); - }) - .Case( - [&](auto op) { - // SetEncoding/UnsetEncoding is the bestOp only if there are no - // other operations. - int64_t estimatedCost = kMinEstimatedCost + 1; - if (estimatedCost < bestEstimatedCost) - return; - bestEstimatedCost = estimatedCost; - bestOp = op; - LLVM_DEBUG(llvm::dbgs() - << "// new best op: '" << bestOp->getName() - << "', cost: " << bestEstimatedCost << "\n"); - }) - .Case([&](auto op) { - int64_t estimatedCost = estimateLinalgExtOpCost(op); - if (estimatedCost < bestEstimatedCost) - return; - bestEstimatedCost = estimatedCost; - bestOp = op; - LLVM_DEBUG(llvm::dbgs() << "// new best op: '" << bestOp->getName() - << "', cost: " << bestEstimatedCost << "\n"); - }) - .Default([&](Operation *op) { - // No cost estimation implemented, skip. - }); - }); - - if (!bestOp) { - std::string bestSummary = ""; - // Check if there is a possible slow memory copy as a dispatch. The current - // heuristic is to check if a dispatch.tensor.store stores a tensor that is - // directly loaded from a dispatch.tensor.load. - regionOp.getWorkgroupBody().walk( - [&](IREE::Flow::DispatchTensorStoreOp storeOp) { - Value input = storeOp.getValue(); - if (auto loadOp = - input.getDefiningOp()) { - bestSummary = "slow_memcpy"; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - return bestSummary; - } - - std::string bestSummary = ""; - TypeSwitch(bestOp) - .Case( - [&](auto op) { bestSummary = summarizeLinalgExtOp(op); }) - .Case( - [&](auto op) { bestSummary = summarizeLinalgOp(op); }) - .Case([&](auto op) { - auto opName = getOpNameWithoutDialectName(op); - auto encoding = op.getResultType() - .getEncoding() - .template cast(); - auto user = stringifyEnum(encoding.getUser().getValue()); - auto role = stringifyEnum(encoding.getRole().getValue()); - ArrayRef shape = op.getSourceType().getShape(); - bestSummary = opName + "_" + user.str() + "_" + role.str() + "_" + - loopRangesToString(shape); - ; - }) - .Case([&](auto op) { - auto opName = getOpNameWithoutDialectName(op); - auto encoding = op.getSourceType() - .getEncoding() - .template cast(); - auto user = stringifyEnum(encoding.getUser().getValue()); - auto role = stringifyEnum(encoding.getRole().getValue()); - ArrayRef shape = op.getResultType().getShape(); - bestSummary = opName + "_" + user.str() + "_" + role.str() + "_" + - loopRangesToString(shape); - }) - .Case( - [&](auto op) { bestSummary = summarizeLinalgExtOp(op); }) - .Default([&](Operation *op) { - // No summarization implemented, default to the op's name. - bestSummary = op->getName().getStringRef().str(); - }); - - // Sanitize the string so that it contains only C literal-compatible chars. 
- bestSummary = sanitizeSymbolName(bestSummary); - - LLVM_DEBUG(llvm::dbgs() << "// best op summary: '" << bestSummary << "'\n"); - return bestSummary; -} - // Creates a flow.executable out of a set of functions, pulling in all other // functions reachable by the provided functions. static ExecutableOp createExecutable(Location loc, StringRef executableName, @@ -475,18 +183,8 @@ class OutlineDispatchRegionsPass for (int i = 0; i < dispatchWorkgroupsOps.size(); ++i) { std::string executableOpName = (namePrefix + "_dispatch_" + llvm::Twine(i)).str(); - // Add a summary of the op as a suffix, if one can be generated. - // Note: the executable names omit this suffix so their names are more - // predictable. - LLVM_DEBUG(llvm::dbgs() - << "//--- summarizing '" << executableOpName << "' ---//\n"); - std::string opSummary = - summarizeDispatchWorkgroupsOp(dispatchWorkgroupsOps[i]); - LLVM_DEBUG(llvm::dbgs() - << "//--- opSummary: '" << opSummary << "' ---//\n\n"); - std::string opSuffix = opSummary.empty() ? "" : "_" + opSummary; - std::string exportOpName = executableOpName + opSuffix; - if (failed(outlineDispatchWorkgroupsOp(executableOpName, exportOpName, + if (failed(outlineDispatchWorkgroupsOp(executableOpName, + executableOpName, dispatchWorkgroupsOps[i]))) { return signalPassFailure(); } diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp index beece70682d8..2500b4388bdb 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp @@ -199,6 +199,11 @@ void buildFlowTransformPassPipeline(OpPassManager &passManager, // wrapped in executables. passManager.addPass(IREE::Flow::createOutlineDispatchRegionsPass()); + // Annotate executables based on their contents. + // This is optional but can provide useful information during compilation and + // runtime profiling/tracing. + passManager.addPass(IREE::Flow::createAnnotateDispatchesPass()); + // Trace/break dispatches by ordinal in the specified region. There is a // similar version of the pass run both before and after deduplication // depending on if the target is specified by ordinal or by symbol. diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h index b8db1df54c13..bbfc494b7452 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.h @@ -203,6 +203,9 @@ std::unique_ptr createCaptureDispatchDynamicDimsPass(); std::unique_ptr> createOutlineDispatchRegionsPass(); +// Annotates executable dispatches based on their contents. +std::unique_ptr> createAnnotateDispatchesPass(); + // Injects tracing markers for dispatch operation tensor inputs and outputs. 
std::unique_ptr> createInjectDispatchTracingPass(); diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td index 2aa8ef36f600..2591ff803c65 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td @@ -9,7 +9,14 @@ include "mlir/Pass/PassBase.td" -def CaptureDispatchDynamicDims : Pass<"iree-flow-capture-dispatch-dynamic-dims", ""> { +def AnnotateDispatches : + Pass<"iree-flow-annotate-dispatches", "mlir::ModuleOp"> { + let summary = "Annotates executable dispatches based on their contents."; + let constructor = "mlir::iree_compiler::IREE::Flow::createAnnotateDispatchesPass()"; +} + +def CaptureDispatchDynamicDims : + Pass<"iree-flow-capture-dispatch-dynamic-dims", ""> { let summary = "Captures dynamic shape dimensions required by dispatch operands/results."; let constructor = "mlir::iree_compiler::IREE::Flow::createCaptureDispatchDynamicDimsPass()"; } @@ -26,6 +33,12 @@ def CleanupNumericNarrowing : let constructor = "mlir::iree_compiler::IREE::Flow::createCleanupNumericNarrowingPass()"; } +def CloneProducersIntoDispatchRegions : + InterfacePass<"iree-flow-clone-producers-into-dispatch-regions", "mlir::FunctionOpInterface"> { + let summary = "Clone producers into dispatch regions to be isolated above"; + let constructor = "mlir::iree_compiler::IREE::Flow::createCloneProducersIntoDispatchRegionsPass()"; +} + def CollapseDims : Pass<"iree-flow-collapse-dims", ""> { let summary = "Collapse reduction dimensions when possible."; @@ -100,10 +113,10 @@ def FormScalarDispatches : let constructor = "mlir::iree_compiler::IREE::Flow::createFormScalarDispatchesPass()"; } -def CloneProducersIntoDispatchRegions : - InterfacePass<"iree-flow-clone-producers-into-dispatch-regions", "mlir::FunctionOpInterface"> { - let summary = "Clone producers into dispatch regions to be isolated above"; - let constructor = "mlir::iree_compiler::IREE::Flow::createCloneProducersIntoDispatchRegionsPass()"; +def FuseDequantizationMatmul: + Pass<"iree-flow-fuse-dequantization-matmul", ""> { + let summary = "Fuse dequantization and matmul linalg.generic ops"; + let constructor = "mlir::iree_compiler::IREE::Flow::createFuseDequantizationMatmulPass()"; } def CollapseDimensions : @@ -150,6 +163,23 @@ def DispatchWithTransformDialect : ]; } +def DumpDispatchGraph : Pass<"iree-flow-dump-dispatch-graph-pass"> { + let summary = "Print visualization of dispatches"; + let options = [ + Option<"maxLabelLen", "max-label-len", "unsigned", + /*default=*/"20", "Limit attribute/type length to number of chars">, + Option<"printAttrs", "print-attrs", "bool", + /*default=*/"true", "Print attributes of operations">, + Option<"printControlFlowEdges", "print-control-flow-edges", "bool", + /*default=*/"false", "Print control flow edges">, + Option<"printDataFlowEdges", "print-data-flow-edges", "bool", + /*default=*/"true", "Print data flow edges">, + Option<"printResultTypes", "print-result-types", "bool", + /*default=*/"true", "Print result types of operations"> + ]; + let constructor = "mlir::iree_compiler::IREE::Flow::createDumpDispatchGraphPass()"; +} + def EraseUnusedLinalgOperands : Pass<"iree-flow-erase-unused-linalg-operands", "mlir::ModuleOp"> { let summary = "Erase unused linalg operand and remove dead code."; @@ -192,7 +222,8 @@ def InferNumericNarrowing : let constructor = "mlir::iree_compiler::IREE::Flow::createInferNumericNarrowingPass()"; } -def InitializeEmptyTensors : 
Pass<"iree-flow-initialize-empty-tensors", ""> { +def InitializeEmptyTensors : + Pass<"iree-flow-initialize-empty-tensors", ""> { let summary = "Initialize empty tensors"; let options = [ Option<"zeroFill", "zero-fill", "bool", /*default=*/"false", @@ -259,40 +290,6 @@ def OutlineDispatchRegions : let constructor = "mlir::iree_compiler::IREE::Flow::createOutlineDispatchRegionsPass()"; } -def SetEncoding : Pass<"iree-flow-set-encoding", ""> { - let summary = "Introduce tensor encoding for compute operations"; - let constructor = "mlir::iree_compiler::IREE::Flow::createSetEncodingPass()"; -} - -def TensorPadToTensorInsertSlice : - Pass<"iree-flow-tensor-pad-to-tensor-insert-slice", ""> { - let summary = "Convert tensor.pad into linalg.fill + tensor.insert_slice"; - let constructor = "mlir::iree_compiler::IREE::Flow::createTensorPadToTensorInsertSlicePass()"; - let options = [ - Option<"optionSkipSingleLinalgOpUses", "skip-one-linalg-use-case", "bool", - /*default=*/"false", - "Skip the op that has only one use which is used" - "by a Linalg op">, - ]; -} - -def DumpDispatchGraph : Pass<"iree-flow-dump-dispatch-graph-pass"> { - let summary = "Print visualization of dispatches"; - let options = [ - Option<"maxLabelLen", "max-label-len", "unsigned", - /*default=*/"20", "Limit attribute/type length to number of chars">, - Option<"printAttrs", "print-attrs", "bool", - /*default=*/"true", "Print attributes of operations">, - Option<"printControlFlowEdges", "print-control-flow-edges", "bool", - /*default=*/"false", "Print control flow edges">, - Option<"printDataFlowEdges", "print-data-flow-edges", "bool", - /*default=*/"true", "Print data flow edges">, - Option<"printResultTypes", "print-result-types", "bool", - /*default=*/"true", "Print result types of operations"> - ]; - let constructor = "mlir::iree_compiler::IREE::Flow::createDumpDispatchGraphPass()"; -} - def RaiseSpecialOps : Pass<"iree-flow-raise-special-ops", ""> { let summary = "raise special ops like softmax to the high level linalg.ext representation"; @@ -305,6 +302,11 @@ def RemoveZeroExtentTensors : let constructor = "mlir::iree_compiler::IREE::Flow::createRemoveZeroExtentTensorsPass()"; } +def SetEncoding : Pass<"iree-flow-set-encoding", ""> { + let summary = "Introduce tensor encoding for compute operations"; + let constructor = "mlir::iree_compiler::IREE::Flow::createSetEncodingPass()"; +} + def SplitReduction : Pass<"iree-flow-split-reduction-ops", ""> { let summary = "Split reduction dimension to increase parallelism."; @@ -317,6 +319,18 @@ def StripSignedness : let constructor = "mlir::iree_compiler::IREE::Flow::createStripSignednessPass()"; } +def TensorPadToTensorInsertSlice : + Pass<"iree-flow-tensor-pad-to-tensor-insert-slice", ""> { + let summary = "Convert tensor.pad into linalg.fill + tensor.insert_slice"; + let constructor = "mlir::iree_compiler::IREE::Flow::createTensorPadToTensorInsertSlicePass()"; + let options = [ + Option<"optionSkipSingleLinalgOpUses", "skip-one-linalg-use-case", "bool", + /*default=*/"false", + "Skip the op that has only one use which is used" + "by a Linalg op">, + ]; +} + def TopLevelSCFToCFG : InterfacePass<"iree-top-level-scf-to-cfg", "mlir::FunctionOpInterface"> { let summary = "Converts non-nested SCF constructs to CFG (not traversing into opaque operations)."; @@ -328,10 +342,4 @@ def VerifyInputLegality: Pass<"iree-verify-input-legality", ""> { let constructor = "mlir::iree_compiler::IREE::Flow::createVerifyInputLegalityPass()"; } -def FuseDequantizationMatmul: - 
Pass<"iree-flow-fuse-dequantization-matmul", ""> { - let summary = "Fuse dequantization and matmul linalg.generic ops"; - let constructor = "mlir::iree_compiler::IREE::Flow::createFuseDequantizationMatmulPass()"; -} - #endif // IREE_DIALECT_FLOW_PASSES diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel index e506ec9fa09f..3ebe0a8e118f 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel @@ -16,6 +16,7 @@ iree_lit_test_suite( name = "lit", srcs = enforce_glob( [ + "annotate_dispatches.mlir", "capture_dispatch_dynamic_dims.mlir", "cleanup_numeric_narrowing.mlir", "cleanup_tensor_shapes.mlir", diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt index 9c7242033bfc..ed4b27ae97f1 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt @@ -14,6 +14,7 @@ iree_lit_test_suite( NAME lit SRCS + "annotate_dispatches.mlir" "capture_dispatch_dynamic_dims.mlir" "cleanup_numeric_narrowing.mlir" "cleanup_tensor_shapes.mlir" diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir new file mode 100644 index 000000000000..184f9d057019 --- /dev/null +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir @@ -0,0 +1,166 @@ +// RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-annotate-dispatches %s | FileCheck %s + +// Dispatches containing some ops get a heuristics-driven summary in their name. +// This also tests symbol reference renaming. 
+ +flow.executable private @ex0 { + // CHECK: flow.executable.export public @dispatch0_fill_4x8_f32 + flow.executable.export public @dispatch0 + builtin.module { + // CHECK: func.func @dispatch0_fill_4x8_f32 + func.func @dispatch0(%arg0: !flow.dispatch.tensor>) { + %cst = arith.constant 1.000000e+02 : f32 + %0 = tensor.empty() : tensor<4x8xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x8xf32>) -> tensor<4x8xf32> + flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0], sizes = [4, 8], strides = [1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor> + return + } + } +} +flow.executable private @ex1 { + // CHECK: flow.executable.export public @dispatch1_fill_8x4_f32 + flow.executable.export public @dispatch1 + builtin.module { + // CHECK: func.func @dispatch1_fill_8x4_f32 + func.func @dispatch1(%arg0: !flow.dispatch.tensor>) { + %cst = arith.constant 2.000000e+02 : f32 + %0 = tensor.empty() : tensor<8x4xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<8x4xf32>) -> tensor<8x4xf32> + flow.dispatch.tensor.store %1, %arg0, offsets = [0, 0], sizes = [8, 4], strides = [1, 1] : tensor<8x4xf32> -> !flow.dispatch.tensor> + return + } + } +} +func.func @main() -> (tensor<4x8xf32>, tensor<8x4xf32>) { + %c100 = arith.constant 100 : index + %c50 = arith.constant 50 : index + // CHECK: flow.dispatch @ex0::@dispatch0_fill_4x8_f32 + %0 = flow.dispatch @ex0::@dispatch0[%c100, %c50]() : () -> tensor<4x8xf32> + // CHECK: flow.dispatch @ex1::@dispatch1_fill_8x4_f32 + %1 = flow.dispatch @ex1::@dispatch1[%c100, %c50]() : () -> tensor<8x4xf32> + return %0, %1 : tensor<4x8xf32>, tensor<8x4xf32> +} + +// ----- + +// A cost model picks the "most expensive" op to include in the summary. + +flow.executable private @ex { + // CHECK: flow.executable.export public @dispatch_fill_40_f32 + flow.executable.export public @dispatch + builtin.module { + func.func @dispatch(%arg0: !flow.dispatch.tensor>) { + %cst = arith.constant 1.000000e+02 : f32 + %0 = tensor.empty() : tensor<10xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<10xf32>) -> tensor<10xf32> + %2 = tensor.empty() : tensor<40xf32> + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<40xf32>) -> tensor<40xf32> + %4 = tensor.empty() : tensor<20xf32> + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<20xf32>) -> tensor<20xf32> + flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor> + return + } + } +} + +// ----- + +// Dynamic dimensions are considered the most expensive. + +flow.executable private @ex { + // CHECK: flow.executable.export public @dispatch_fill_DxDxD_f32 + flow.executable.export public @dispatch + builtin.module { + func.func @dispatch(%arg0: index, %arg1: !flow.dispatch.tensor>) { + %cst = arith.constant 1.000000e+02 : f32 + %0 = tensor.empty() : tensor<10xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<10xf32>) -> tensor<10xf32> + %2 = tensor.empty(%arg0, %arg0, %arg0) : tensor + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor) -> tensor + flow.dispatch.tensor.store %1, %arg1, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor> + return + } + } +} + +// ----- + +// Dispatch key op with multiple datatypes should be reflected in summary. 
+ +flow.executable private @ex { + // CHECK: flow.executable.export public @dispatch_generic_4x8_i32xf32 + flow.executable.export public @dispatch + builtin.module { + func.func @dispatch(%arg0: !flow.dispatch.tensor>) { + %0 = tensor.empty() : tensor<4x8xi32> + %1 = tensor.empty() : tensor<4x8xf32> + %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<4x8xi32>) outs(%1 : tensor<4x8xf32>) { + ^bb0(%in: i32, %out: f32): + %3 = arith.index_cast %in : i32 to index + %extracted = tensor.extract %1[%3, %3] : tensor<4x8xf32> + linalg.yield %extracted : f32 + } -> tensor<4x8xf32> + flow.dispatch.tensor.store %2, %arg0, offsets = [0, 0], sizes = [4, 8], strides = [1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor> + return + } + } +} + +// ----- + +// Dispatches set_encoding and unset_encoding ops get a heuristics-driven +// summary in their name. + +flow.executable private @ex0 { + // CHECK: flow.executable.export public @dispatch0_map_DxD_f32 + flow.executable.export public @dispatch0 + builtin.module { + func.func @dispatch0(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor>>) { + %0 = flow.dispatch.workload.ordinal %arg2, 0 : index + %1 = flow.dispatch.workload.ordinal %arg3, 1 : index + %2 = flow.dispatch.workload.ordinal %arg4, 2 : index + %3 = flow.dispatch.workload.ordinal %arg5, 3 : index + %4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%0, %1} + %5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor>{%2, %3} + %6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor>>{%2, %3} + %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %mapped = linalg.map { math.absf } ins(%7 : tensor) outs(%8 : tensor) + %9 = iree_linalg_ext.set_encoding %mapped : tensor -> tensor> + flow.dispatch.tensor.store %9, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor> -> !flow.dispatch.tensor>>{%arg4, %arg5} + return + } + } +} +flow.executable private @ex1 { + // CHECK: flow.executable.export public @dispatch1_unset_encoding_MATMUL_F32F32F32_LHS_DxD + flow.executable.export public @dispatch1 + builtin.module { + func.func @dispatch1(%arg0: !flow.dispatch.tensor>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor>) { + %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>>{%arg1, %arg2} + %1 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor>{%arg1, %arg2} + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%arg1, %arg2], strides = [1, 1] : !flow.dispatch.tensor>>{%arg1, %arg2} -> tensor> + %3 = iree_linalg_ext.unset_encoding %2 : tensor> -> tensor + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [%arg1, %arg2], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%arg1, %arg2} + return + } + } +} + +// ----- + +// Named root linalg ops get represented in the dispatch name. 
+ +flow.executable private @ex { + // CHECK: flow.executable.export public @dispatch_softmax_7xf32 + flow.executable.export public @dispatch + builtin.module { + func.func @dispatch(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>) { + %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor> -> tensor<7xf32> + %1 = tensor.empty() : tensor<7xf32> + %2 = linalg.softmax dimension(0) ins(%0 : tensor<7xf32>) outs(%1 : tensor<7xf32>) -> tensor<7xf32> + flow.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor> + return + } + } +} diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir index e9bb837b9d88..446268544e81 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir @@ -176,175 +176,3 @@ func.func @dispatchWithCountRegion(%arg0: tensor<4xi32>) -> tensor<4xi32> { } return %0 : tensor<4xi32> } - -// ----- - -// Dispatches containing some ops get a heuristics-driven summary in their name. - -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_fill_4x8 -// CHECK: func.func @main_dispatch_0_fill_4x8_f32( -func.func @main() -> tensor<4x8xf32> { - %x = arith.constant 100 : index - %y = arith.constant 50 : index - %0 = flow.dispatch.workgroups[%x, %y]() : () -> (tensor<4x8xf32>) = ( - %ret: !flow.dispatch.tensor> - ) { - %cst = arith.constant 100.0 : f32 - %init = tensor.empty() : tensor<4x8xf32> - %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<4x8xf32>) -> tensor<4x8xf32> - flow.dispatch.tensor.store %fill, %ret, offsets = [0, 0], sizes = [4, 8], strides = [1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor> - flow.return - } - return %0 : tensor<4x8xf32> -} - -// ----- - -// A cost model picks the "most expensive" op to include in the summary. - -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_fill_40 -// CHECK: func.func @main_dispatch_0_fill_40_f32( -func.func @main() -> tensor<10xf32> { - %x = arith.constant 100 : index - %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<10xf32>) = ( - %ret: !flow.dispatch.tensor> - ) { - %cst = arith.constant 100.0 : f32 - %init_small = tensor.empty() : tensor<10xf32> - %fill_small = linalg.fill ins(%cst : f32) outs(%init_small : tensor<10xf32>) -> tensor<10xf32> - // Note the ordering here - test that we don't just pick the first or the - // last op. If an op in the middle has a higher cost then it should be used. - %init_large = tensor.empty() : tensor<40xf32> - %fill_large = linalg.fill ins(%cst : f32) outs(%init_large : tensor<40xf32>) -> tensor<40xf32> - %init_medium = tensor.empty() : tensor<20xf32> - %fill_medium = linalg.fill ins(%cst : f32) outs(%init_medium : tensor<20xf32>) -> tensor<20xf32> - flow.dispatch.tensor.store %fill_small, %ret, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor> - flow.return - } - return %0 : tensor<10xf32> -} - -// ----- - -// Dynamic dimensions are considered the most expensive. 
- -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_fill_DxDxD -// CHECK: func.func @main_dispatch_0_fill_DxDxD_f32( -func.func @main(%arg0 : index) -> tensor<10xf32> { - %x = arith.constant 100 : index - %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<10xf32>) = ( - %arg0: index, - %ret: !flow.dispatch.tensor> - ) { - %cst = arith.constant 100.0 : f32 - %init_small = tensor.empty() : tensor<10xf32> - %fill_small = linalg.fill ins(%cst : f32) outs(%init_small : tensor<10xf32>) -> tensor<10xf32> - %init_dynamic = tensor.empty(%arg0, %arg0, %arg0) : tensor - %fill_dynamic = linalg.fill ins(%cst : f32) outs(%init_dynamic : tensor) -> tensor - flow.dispatch.tensor.store %fill_small, %ret, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor> - flow.return - } - return %0 : tensor<10xf32> -} - -// ----- - -// Dispatch key op with multiple datatypes should be reflected in summary. - -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_generic_4x8_i32xf32 -// CHECK: func.func @main_dispatch_0_generic_4x8_i32xf32( -func.func @main() -> tensor<4x8xf32> { - %x = arith.constant 100 : index - %y = arith.constant 50 : index - %0 = flow.dispatch.workgroups[%x, %y]() : () -> (tensor<4x8xf32>) = ( - %ret: !flow.dispatch.tensor> - ) { - %a = tensor.empty() : tensor<4x8xi32> - %b = tensor.empty() : tensor<4x8xf32> - %ans = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%a : tensor<4x8xi32>) outs(%b : tensor<4x8xf32>) { - ^bb0(%b0 : i32, %b1 : f32): - %1 = arith.index_cast %b0 : i32 to index - %2 = tensor.extract %b[%1, %1] : tensor<4x8xf32> - linalg.yield %2 : f32 - } -> tensor<4x8xf32> - flow.dispatch.tensor.store %ans, %ret, offsets = [0, 0], sizes = [4, 8], strides = [1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor> - flow.return - } - return %0 : tensor<4x8xf32> -} - -// ----- - -// Dispatches set_encoding and unset_encoding ops get a heuristics-driven -// summary in their name. 
- -// CHECK: flow.executable private @main_dispatch_0 -// CHECK: func.func @main_dispatch_0_map_DxD -// CHECK: flow.executable private @main_dispatch_1 -// CHECK: func.func @main_dispatch_1_unset_encoding_MATMUL_F32F32F32_LHS_DxD -func.func @main(%arg0: tensor, %arg1: index, %arg2: index, %arg3: tensor, %arg4: index, %arg5: index) -> (tensor, index, index) { - %0 = flow.tensor.tie_shape %arg0 : tensor{%arg1, %arg2} - %1 = flow.tensor.tie_shape %arg3 : tensor{%arg4, %arg5} - %2 = flow.dispatch.workgroups[%arg1, %arg2, %arg4, %arg5](%0, %1, %arg1, %arg2, %arg4, %arg5) : (tensor{%arg1, %arg2}, tensor{%arg4, %arg5}, index, index, index, index) -> tensor>{%arg4, %arg5} = - (%arg6: !flow.dispatch.tensor>, %arg7: !flow.dispatch.tensor>, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: !flow.dispatch.tensor>>) { - %arg8_0 = flow.dispatch.workload.ordinal %arg8, 0 : index - %arg9_0 = flow.dispatch.workload.ordinal %arg9, 1 : index - %arg10_0 = flow.dispatch.workload.ordinal %arg10, 2 : index - %arg11_0 = flow.dispatch.workload.ordinal %arg11, 3 : index - %4 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor>{%arg8_0, %arg9_0} - %5 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor>{%arg10_0, %arg11_0} - %6 = flow.dispatch.tie_shape %arg12 : !flow.dispatch.tensor>>{%arg10_0, %arg11_0} - %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%arg8_0, %arg9_0], strides = [1, 1] : !flow.dispatch.tensor>{%arg8_0, %arg9_0} -> tensor - %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%arg10_0, %arg11_0], strides = [1, 1] : !flow.dispatch.tensor>{%arg10_0, %arg11_0} -> tensor - %mapped = linalg.map { math.absf } ins(%7 : tensor) outs(%8 : tensor) - %9 = iree_linalg_ext.set_encoding %mapped : tensor -> tensor> - flow.dispatch.tensor.store %9, %6, offsets = [0, 0], sizes = [%arg10_0, %arg11_0], strides = [1, 1] : tensor> -> !flow.dispatch.tensor>>{%arg10, %arg11} - flow.return - } count(%arg6: index, %arg7: index, %arg8: index, %arg9: index) -> (index, index, index) { - %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg6, %arg7, %arg8, %arg9 - flow.return %x, %y, %z : index, index, index - } - %3 = flow.dispatch.workgroups[%arg4, %arg5](%2, %arg4, %arg5) : (tensor>{%arg4, %arg5}, index, index) -> tensor{%arg4, %arg5} = - (%arg6: !flow.dispatch.tensor>>, %arg7: index, %arg8: index, %arg9: !flow.dispatch.tensor>) { - %4 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor>>{%arg7, %arg8} - %5 = flow.dispatch.tie_shape %arg9 : !flow.dispatch.tensor>{%arg7, %arg8} - %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : !flow.dispatch.tensor>>{%arg7, %arg8} -> tensor> - %7 = iree_linalg_ext.unset_encoding %6 : tensor> -> tensor - flow.dispatch.tensor.store %7, %5, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%arg7, %arg8} - flow.return - } count(%arg6: index, %arg7: index) -> (index, index, index) { - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg6, %arg7 - flow.return %x, %y, %z : index, index, index - } - return %3, %arg1, %arg2 : tensor, index, index -} - -// ----- - -// iree_linalg_ext ops get a heuristics-driven summary in their name. 
- -// CHECK: flow.executable private @main_dispatch_0 { -// CHECK-NEXT: flow.executable.export public @main_dispatch_0_softmax_7xf32 -// CHECK: func.func @main_dispatch_0_softmax_7xf32( -func.func @main(%arg0: tensor<7xf32>) -> tensor<7xf32> { - %c7 = arith.constant 7 : index - %0 = flow.dispatch.workgroups[%c7](%arg0) : (tensor<7xf32>) -> tensor<7xf32> = - (%arg1: !flow.dispatch.tensor>, %arg2: !flow.dispatch.tensor>) { - %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [7], strides = [1] : !flow.dispatch.tensor> -> tensor<7xf32> - %2 = tensor.empty() : tensor<7xf32> - %3 = linalg.softmax dimension(0) ins(%1 : tensor<7xf32>) outs(%2 : tensor<7xf32>) -> tensor<7xf32> - flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor> - flow.return - } count(%arg1: index) -> (index, index, index) { - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1 - flow.return %x, %y, %z : index, index, index - } - return %0 : tensor<7xf32> -}
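
Illustrative example (hypothetical, not part of the patch): the annotation derives each export's suffix from the "most expensive" op's name, its static loop ranges, and its operand element types. For a dispatch whose heaviest op is a static 128x64x256 f32 matmul, the cost model prefers the matmul over the fill, so the export and function would be renamed to something like @dispatch_matmul_128x64x256_f32. The symbols @example and @dispatch below are made up for illustration only.

flow.executable private @example {
  // Expected rename after --iree-flow-annotate-dispatches:
  //   @dispatch_matmul_128x64x256_f32
  flow.executable.export public @dispatch
  builtin.module {
    func.func @dispatch(%lhs: !flow.dispatch.tensor<readonly:tensor<128x256xf32>>, %rhs: !flow.dispatch.tensor<readonly:tensor<256x64xf32>>, %out: !flow.dispatch.tensor<writeonly:tensor<128x64xf32>>) {
      %cst = arith.constant 0.0 : f32
      %0 = flow.dispatch.tensor.load %lhs, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
      %1 = flow.dispatch.tensor.load %rhs, offsets = [0, 0], sizes = [256, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x64xf32>> -> tensor<256x64xf32>
      // The fill (128x64 iterations) costs less than the matmul (128x64x256
      // iterations), so the matmul is chosen for the summary.
      %2 = tensor.empty() : tensor<128x64xf32>
      %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<128x64xf32>) -> tensor<128x64xf32>
      %4 = linalg.matmul ins(%0, %1 : tensor<128x256xf32>, tensor<256x64xf32>) outs(%3 : tensor<128x64xf32>) -> tensor<128x64xf32>
      flow.dispatch.tensor.store %4, %out, offsets = [0, 0], sizes = [128, 64], strides = [1, 1] : tensor<128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x64xf32>>
      return
    }
  }
}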