diff --git a/include/TPP/Passes.h b/include/TPP/Passes.h
index 3ad0a62da..9aaa09968 100644
--- a/include/TPP/Passes.h
+++ b/include/TPP/Passes.h
@@ -22,6 +22,10 @@ namespace arith {
 class ArithDialect;
 } // namespace arith
 
+namespace async {
+class AsyncDialect;
+} // namespace async
+
 namespace check {
 class CheckDialect;
 } // namespace check
@@ -56,6 +60,10 @@ namespace memref {
 class MemRefDialect;
 } // namespace memref
 
+namespace omp {
+class OpenMPDialect;
+} // namespace omp
+
 namespace perf {
 class PerfDialect;
 } // namespace perf
diff --git a/include/TPP/Passes.td b/include/TPP/Passes.td
index 82291dbef..6f1c25eea 100644
--- a/include/TPP/Passes.td
+++ b/include/TPP/Passes.td
@@ -11,6 +11,23 @@
 include "mlir/Pass/PassBase.td"
 
+def LoadTppDialects : Pass<"load-tpp-dialects", "ModuleOp"> {
+  let summary = "Pre-load all TPP-specific dialects";
+  let description = [{
+    Pre-load dialects that -transform-interpreter would otherwise load at runtime.
+
+    The issue is that -transform-interpreter runs inside the multi-threaded
+    pass manager. When the interpreter then tries to load dependent dialects
+    dynamically, an assert fires, since loading dialects during multi-threaded
+    execution could lead to concurrency issues.
+  }];
+  let dependentDialects = ["xsmm::XsmmDialect",
+                           "check::CheckDialect",
+                           "perf::PerfDialect",
+                           "omp::OpenMPDialect",
+                           "async::AsyncDialect"];
+}
+
 def ConvertLinalgToXsmm : Pass<"convert-linalg-to-xsmm", "func::FuncOp"> {
   let summary = "Convert linalg to xsmm";
   let description = [{
diff --git a/lib/TPP/CMakeLists.txt b/lib/TPP/CMakeLists.txt
index 6911db9dd..31b6d8a0b 100644
--- a/lib/TPP/CMakeLists.txt
+++ b/lib/TPP/CMakeLists.txt
@@ -12,6 +12,7 @@ get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
 add_mlir_library(TPPPipeline
   DefaultPipeline.cpp
   DefaultTppPasses.cpp
+  LoadTppDialects.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/include/TPP
diff --git a/lib/TPP/LoadTppDialects.cpp b/lib/TPP/LoadTppDialects.cpp
new file mode 100644
index 000000000..e747d3044
--- /dev/null
+++ b/lib/TPP/LoadTppDialects.cpp
@@ -0,0 +1,41 @@
+//===- LoadTppDialects.cpp --------------------------------------*- C++ -*-===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is a no-op; it is only used for the side effect of loading dialects.
+//
+//===----------------------------------------------------------------------===//
+#include "mlir/Pass/Pass.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "TPP/Dialect/Check/CheckDialect.h"
+#include "TPP/Dialect/Perf/PerfDialect.h"
+#include "TPP/Dialect/Xsmm/XsmmDialect.h"
+#include "mlir/Dialect/Async/IR/Async.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+
+
+namespace mlir {
+namespace tpp {
+#define GEN_PASS_DEF_LOADTPPDIALECTS
+#include "TPP/Passes.h.inc"
+} // namespace tpp
+} // namespace mlir
+
+using namespace mlir;
+using namespace std;
+
+namespace mlir {
+namespace tpp {
+struct LoadTppDialects
+    : public impl::LoadTppDialectsBase<LoadTppDialects> {
+  void runOnOperation() override {}
+};
+} // namespace tpp
+} // namespace mlir
+
diff --git a/tools/tpp-opt/tpp-opt.py b/tools/tpp-opt/tpp-opt.py
new file mode 100755
index 000000000..63387bc83
--- /dev/null
+++ b/tools/tpp-opt/tpp-opt.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python
+
+import sys
+from enum import Enum
+from typing import Optional, Sequence
+from argparse import ArgumentParser
+
+from mlir import ir
+from mlir.ir import Context, Location, InsertionPoint
+from mlir.dialects import transform
+from mlir.dialects.transform import structured
+
+
+GpuBackend = Enum("GpuBackend", [("intel", "intel"), ("cuda", "cuda")])
+
+
+# Wrapper to reduce verbosity.
+def ApplyRegisteredPass(*args, **kwargs):
+    return transform.ApplyRegisteredPassOp(
+        transform.AnyOpType.get(), *args, **kwargs
+    )
+
+
+# Wrapper to reduce verbosity.
+def Match(*args, **kwargs):
+    return structured.MatchOp(transform.AnyOpType.get(), *args, **kwargs)
+
+
+# TODO: consider making into a NamedSequence to call with IncludeOp
+def CleanUp(op):
+    op = ApplyRegisteredPass(op, "canonicalize")
+    transform.ApplyCommonSubexpressionEliminationOp(op)
+    return op
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def TppMapping(
+    mod, lower_pack_unpack_without_transpose: bool = False, **_config
+):
+    # Preprocess convolutions.
+    func = Match(mod, ops={"func.func"})
+    ApplyRegisteredPass(func, "conv-init-simplify")
+    mod = CleanUp(mod)
+    # Convert ops to packed layouts.
+    func = Match(mod, ops={"func.func"})
+    func = ApplyRegisteredPass(func, "pack-conv2DNchwFchw")
+    func = ApplyRegisteredPass(func, "pack-conv2DNhwcHwcf")
+    func = ApplyRegisteredPass(func, "rewrite-conv-to-matmul-or-brgemm")
+    func = ApplyRegisteredPass(func, "pack-matmul")
+    ApplyRegisteredPass(func, "pack-vnni")
+    if lower_pack_unpack_without_transpose:
+        mod = ApplyRegisteredPass(mod, "lower-packs-unpacks-without-transpose")
+    # Postprocess packing.
+    # Run only the canonicalizer at this stage, as a full cleanup (mostly CSE)
+    # can mess up the tensor producer-consumer chains used for analysis in the
+    # following passes.
+    func = Match(mod, ops={"func.func"})
+    ApplyRegisteredPass(func, "propagate-pack-and-unpack")
+    mod = ApplyRegisteredPass(mod, "constant-fold-pack")
+    func = Match(mod, ops={"func.func"})
+    func = ApplyRegisteredPass(func, "simplify-pack")
+    ApplyRegisteredPass(func, "linalg-generalize-named-ops")
+    mod = CleanUp(mod)
+    func = Match(mod, ops={"func.func"})
+    func = ApplyRegisteredPass(
+        func, "linalg-convert-compare-select-to-maximumf-pass"
+    )
+    func = ApplyRegisteredPass(func, "tile-consumer-and-fuse-producers")
+    ApplyRegisteredPass(func, "simplify-pack")
+    mod = CleanUp(mod)
+    return mod
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def LinalgLowering(mod, /, *, skip_operations: Sequence[str] = (), **_config):
+    func = Match(mod, ops={"func.func"})
+    func = ApplyRegisteredPass(
+        func,
+        "convert-linalg-to-xsmm",
+        options="skip-operations=" + ",".join(skip_operations),
+    )
+    func = ApplyRegisteredPass(func, "combine-xsmm-op-optimization")
+    func = ApplyRegisteredPass(func, "fold-xsmm-flags")
+    ApplyRegisteredPass(func, "verify-xsmm-calls")
+    return mod
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def VectorToXsmm(mod, **_config):
+    mod = ApplyRegisteredPass(mod, "vector-to-xsmm")
+    return mod
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def VectorToKernel(mod, **_config):
+    func = Match(mod, ops={"func.func"})
+    func = ApplyRegisteredPass(func, "hoist-vector-transfer")
+    func = ApplyRegisteredPass(func, "canonicalize")
+    ApplyRegisteredPass(func, "vector-contract-to-fma")
+    return mod
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def LowLevelParallelization(
+    mod,
+    /,
+    *,
+    parallel_task_grid: Sequence[int],  # NB: should be `Seq["certain pos ints"]`
+    **_config,
+):
+    # Note that LICM should be performed before any function calls are generated
+    # to ensure that ops which map directly to functions also get moved outside
+    # of loops, if possible. This approach assumes that the function calls do
+    # not have any side effects and can be safely moved outside of the loop body.
+    func = Match(mod, ops={"func.func"})
+    func = ApplyRegisteredPass(func, "loop-invariant-code-motion")
+    ApplyRegisteredPass(func, "hoist-vector-transfer")
+    # Run cleanup after LICM to allow CSE to eliminate common operations now
+    # that they are hoisted out of loops.
+    mod = CleanUp(mod)
+    tile_sizes = ",".join(str(n) for n in parallel_task_grid)
+    mod = ApplyRegisteredPass(
+        mod,
+        "scf-parallel-loop-tiling",
+        options=f"parallel-loop-tile-sizes={tile_sizes}",
+    )
+    return mod
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def LocalDialectsLowering(mod, **_config):
+    func = Match(mod, ops={"func.func"})
+    func = ApplyRegisteredPass(func, "convert-check-to-loops")
+    ApplyRegisteredPass(func, "convert-perf-to-loops")
+    mod = ApplyRegisteredPass(mod, "convert-perf-to-func")
+    return mod
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def PostProcessing(mod, /, **_config):
+    # Postprocess buffers.
+    func = Match(mod, ops={"func.func"})
+    ApplyRegisteredPass(func, "buffer-hoisting")
+    mod = CleanUp(mod)
+    return mod
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def DefaultTpp(
+    mod,
+    /,
+    *,
+    linalg_to_vector: bool = False,
+    vector_to_xsmm: bool = False,
+    vector_to_kernel: bool = False,
+    linalg_to_loops: bool = False,
+    lhs_tile: Optional[Sequence[int]] = None,  # NB: should be `Seq["certain pos ints"]`
+    rhs_tile: Optional[Sequence[int]] = None,  # NB: should be `Seq["certain pos ints"]`
+    **config,
+):
+    # We currently have four flows:
+    #  * linalg-to-xsmm: linalg to XSMM-calls patterns -- the default.
+    #  * linalg-to-vector: no changes at linalg level, lower to straight loops.
+    #  * vector-to-xsmm: linalg-to-vector and vector to XSMM-calls patterns.
+    #  * vector-to-kernel: linalg-to-vector and vector to XSMM-like micro-kernel
+    #    patterns via specialized lowering of certain vector patterns.
+    if vector_to_kernel and vector_to_xsmm:
+        raise ValueError("XSMM and Kernel lowering are mutually exclusive")
+    force_linalg_to_vector = vector_to_kernel or vector_to_xsmm
+
+    # Operations to skip when lowering Linalg to XSMM / Kernel. This allows
+    # later passes to lower these ops to vector, function calls, or codegen.
+    skip_ops = set()
+    # The general linalg-to-vector choice needs to skip all XSMM matching at
+    # the linalg level.
+    if linalg_to_vector or vector_to_kernel:
+        skip_ops |= {"all"}
+    elif vector_to_xsmm:
+        skip_ops |= {"transpose", "vnni"}
+
+    mod = ApplyRegisteredPass(mod, "fold-add-into-dest")
+    if linalg_to_loops:
+        # Lower linalg directly to loops, skipping all TPP transformations.
+        func = Match(mod, ops={"func.func"})
+        func = ApplyRegisteredPass(func, "lower-packs-unpacks")
+        ApplyRegisteredPass(func, "decompose-aggregated-ops")
+        mod = ApplyRegisteredPass(mod, "bufferize")
+        func = Match(mod, ops={"func.func"})
+        ApplyRegisteredPass(func, "convert-linalg-to-loops")
+        mod = CleanUp(mod)
+    else:
+        mod = ApplyRegisteredPass(mod, "fold-into-eltwise")
+        func = Match(mod, ops={"func.func"})
+        func = ApplyRegisteredPass(func, "convert-linalg-to-inplace")
+
+        ApplyRegisteredPass(func, "rewrite-batch-matmul-to-matmul")
+        # Bundle of linalg-level passes to fuse and pack:
+        mod = TppMapping(mod, **config)  # TODO: convert to called NamedSequence
+        func = Match(mod, ops={"func.func"})
+        ApplyRegisteredPass(func, "lower-packs-unpacks")
+        mod = CleanUp(mod)
+        func = Match(mod, ops={"func.func"})
+        ApplyRegisteredPass(func, "decompose-aggregated-ops")
+        transform.PrintOp(target=mod, name="before-bufferize")
+        mod = ApplyRegisteredPass(mod, "bufferize")
+        mod = LinalgLowering(mod, skip_operations=skip_ops, **config)
+        if linalg_to_vector or force_linalg_to_vector:
+            func = Match(mod, ops={"func.func"})
+            options = "lhsTile=" + ",".join(str(n) for n in lhs_tile)
+            options += " " + "rhsTile=" + ",".join(str(n) for n in rhs_tile)
+            func = ApplyRegisteredPass(
+                func, "brgemm-linalg-tiling", options=options
+            )
+            func = ApplyRegisteredPass(func, "loop-invariant-code-motion")
+            ApplyRegisteredPass(func, "vectorization-pass")
+            # NB: the canonicalizer should run after the hoisting pass because
+            # it fuses the outer tiling loops, which leaves no patterns for the
+            # hoisting pass to match. Moved inside the VectorToKernel path.
+            if vector_to_xsmm:
+                mod = VectorToXsmm(mod)
+            if vector_to_kernel:
+                mod = VectorToKernel(mod)
+        mod = CleanUp(mod)
+        func = Match(mod, ops={"func.func"})
+        ApplyRegisteredPass(func, "convert-forall-to-parallel")
+
+    if linalg_to_vector:
+        mod = ApplyRegisteredPass(mod, "convert-vector-to-scf")
+        mod = LowLevelParallelization(mod, **config)
+    else:
+        mod = LowLevelParallelization(mod, **config)
+        # TODO: These passes have been moved out of LowLevelParallelization
+        # since they apply to the xsmm dialect. They'll be moved back in
+        # subsequent commits.
+        func = Match(mod, ops={"func.func"})
+        func = ApplyRegisteredPass(func, "intel-amx-tile-config-insertion-pass")
+        func = ApplyRegisteredPass(func, "canonicalize")
+        func = ApplyRegisteredPass(func, "loop-invariant-code-motion")
+        func = ApplyRegisteredPass(func, "canonicalize")
+        ApplyRegisteredPass(func, "intel-amx-tile-config-hoisting-pass")
+    # TODO: This pass has been moved out of LocalDialectsLowering since it is
+    # applicable to xsmm only. It'll be moved back in subsequent commits.
+    mod = ApplyRegisteredPass(mod, "convert-xsmm-to-func")
+    # Convert all local TPP-related dialects.
+    mod = LocalDialectsLowering(mod, **config)
+    # Clean up after the default pipeline.
+    mod = PostProcessing(mod, **config)
+    return mod
+
+
+# TODO: make bundle into a NamedSequence to call with IncludeOp
+def DefaultPipeline(
+    mod,
+    /,
+    *,
+    def_parallel: bool = False,
+    gpu_backend: Optional[GpuBackend] = None,
+    **config,
+):
+    transform.PrintOp(target=mod)
+    if not gpu_backend:
+        mod = DefaultTpp(mod, **config)
+    else:
+        assert False, "not implemented for now"
+    # Bail out early for Intel GPU. The rest of the lowering is performed by
+    # IMEX.
+    if gpu_backend == "intel":
+        return mod
+
+    # Partial lowering.
+    mod = ApplyRegisteredPass(mod, "expand-strided-metadata")
+    mod = ApplyRegisteredPass(mod, "convert-tensor-to-linalg")
+    func = Match(mod, ops={"func.func"})
+    ApplyRegisteredPass(func, "convert-linalg-to-loops")
+    if def_parallel:
+        mod = ApplyRegisteredPass(mod, "convert-scf-to-openmp")
+    mod = ApplyRegisteredPass(mod, "convert-vector-to-scf")
+    mod = ApplyRegisteredPass(mod, "arith-expand")
+    mod = ApplyRegisteredPass(mod, "lower-affine")
+
+    transform.PrintOp(target=mod, name="HERE!")
+
+    # Lower to LLVM.
+    mod = ApplyRegisteredPass(mod, "convert-vector-to-llvm")
+    mod = ApplyRegisteredPass(mod, "finalize-memref-to-llvm")
+    mod = ApplyRegisteredPass(mod, "convert-scf-to-cf")
+    if def_parallel:
+        mod = ApplyRegisteredPass(mod, "convert-openmp-to-llvm")
+    mod = ApplyRegisteredPass(mod, "convert-math-to-llvm")
+
+    if gpu_backend:
+        func = Match(mod, ops={"func.func"})
+        ApplyRegisteredPass(func, "gpu-async-region")
+        assert False
+        # gpu-to-llvm cannot be invoked from the transform-interpreter as it
+        # tries to load ... something while the multi-threaded PassManager is running.
+        mod = ApplyRegisteredPass(mod, "gpu-to-llvm")
+        mod = ApplyRegisteredPass(
+            mod, "gpu-module-to-binary", options="compilation-target=fatbin"
+        )
+        mod = ApplyRegisteredPass(mod, "async-to-async-runtime")
+        mod = ApplyRegisteredPass(mod, "async-runtime-ref-counting")
+        mod = ApplyRegisteredPass(mod, "convert-async-to-llvm")
+
+    mod = ApplyRegisteredPass(mod, "convert-func-to-llvm")
+    # FIXME: once llvm-project is updated, add -convert-arith-to-llvm and
+    # -convert-cf-to-llvm here.
+    func = Match(mod, ops={"func.func"})
+    func = ApplyRegisteredPass(func, "convert-arith-to-llvm")
+    # func = ApplyRegisteredPass(func, "convert-cf-to-llvm")
+    func = ApplyRegisteredPass(func, "canonicalize")
+    transform.ApplyCommonSubexpressionEliminationOp(func)
+    mod = ApplyRegisteredPass(mod, "reconcile-unrealized-casts")
+
+    # Anything useful has been lowered by now.
+    # Clean up the IR by removing any dead symbols.
+    # This step aims to avoid errors caused by frontend leftovers.
+    # See issue: #704
+    transform.ApplyDeadCodeEliminationOp(mod)
+
+    return mod
+
+
+def MainSchedule(**config):
+    module = ir.Module.create()
+    module.operation.attributes["transform.with_named_sequence"] = (
+        ir.UnitAttr.get()
+    )
+    with InsertionPoint(module.body):
+        named_sequence = transform.NamedSequenceOp(
+            "__transform_main",
+            [transform.AnyOpType.get()],  # input types
+            [transform.AnyOpType.get()],  # output types
+            arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}],
+        )
+        with InsertionPoint(named_sequence.body):
+            func = Match(named_sequence.bodyTarget, ops={"func.func"})
+            mod = transform.GetParentOp(
+                transform.AnyOpType.get(),
+                func,
+                op_name="builtin.module",
+                deduplicate=True,
+            )
+
+            mod = DefaultPipeline(mod, **config)
+            transform.YieldOp(mod)
+    return module
+
+
+def config_from_args(args: Sequence[str]):
+    def csints(s):
+        return [int(n) for n in s.split(",")]
+
+    parser = ArgumentParser(prog="tpp-opt.py", description="TODO")
+    parser.add_argument(
+        "--gpu", choices=[o.value for o in GpuBackend], dest="gpu_backend"
+    )
+    parser.add_argument("--parallel-task-grid", type=csints, default="2,8")
+    parser.add_argument("--lhs-tile", type=csints, default="8,8")
+    parser.add_argument("--rhs-tile", type=csints, default="8,16")
+    parser.add_argument("--def-parallel", action="store_true")
+    parser.add_argument("--vector-to-xsmm", action="store_true")
+    parser.add_argument(
+        "--vector-to-kernels", dest="vector_to_kernel", action="store_true"
+    )
+    parser.add_argument("--linalg-to-vector", action="store_true")
+    parser.add_argument(
+        "--lower-pack-unpack-without-transpose", action="store_true"
+    )
+
+    return vars(parser.parse_args(args))
+
+
+def main():
+    config = config_from_args(sys.argv[1:])
+    print(config, file=sys.stderr)
+    with Context(), Location.name("main_schedule"):
+        module = MainSchedule(**config)
+
+        print(module)
+
+
+if __name__ == "__main__":
+    main()
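
Note (not part of the diff): tpp-opt.py only builds and prints the transform-dialect schedule. The printed schedule is then meant to be applied to payload IR by tpp-opt, with the new -load-tpp-dialects pass run ahead of -transform-interpreter so that the TPP dialects are already loaded before the multi-threaded pass manager starts, as described in Passes.td above. The sketch below shows how the generator could also be driven programmatically instead of via main(); it assumes it is appended to (or imports MainSchedule and config_from_args from) tpp-opt.py, and the emit_schedule helper and the schedule.mlir file name are purely illustrative.

# Hypothetical driver on top of tpp-opt.py (illustrative, not part of the patch).
from mlir.ir import Context, Location


def emit_schedule(path, argv):
    """Build the transform schedule for CLI-style args and write it to a file."""
    config = config_from_args(argv)  # defined in tpp-opt.py above
    with Context(), Location.name("main_schedule"):
        module = MainSchedule(**config)  # defined in tpp-opt.py above
        module.operation.verify()  # fail early on a malformed schedule
        with open(path, "w") as f:
            f.write(str(module))


# For example, emit the schedule for the vector-to-kernels flow with the
# default tile sizes:
emit_schedule("schedule.mlir", ["--vector-to-kernels"])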