From fa8423786dd2c1ea11d791b47d81c157fb911671 Mon Sep 17 00:00:00 2001
From: Ean Garvey
Date: Wed, 6 Sep 2023 11:34:45 -0700
Subject: [PATCH] Add pass for lowering to accel ukernels.

Add lit test and fix build.
Fixes to LowerToAccelUKernelPass
Tweaks to LowerToAccelUKernelsPass.
Add AccelMatmulExpert pass pipeline
Apply clang-format to new C++ files. (#3)
- Apply clangformat.
use 'accel' identifier
Use parameter struct calling convention
Tweaks to KernelDispatch and AccelMatmulExpert pipeline

Co-authored-by: Sungsoon Cho
---
 .../Codegen/Dialect/IREECodegenAttrs.td       |   4 +-
 .../iree/compiler/Codegen/LLVMCPU/BUILD.bazel |   1 +
 .../compiler/Codegen/LLVMCPU/CMakeLists.txt   |   1 +
 .../Codegen/LLVMCPU/KernelDispatch.cpp        |  14 +-
 .../LLVMCPU/LLVMCPULowerExecutableTarget.cpp  |   8 +
 .../LLVMCPU/LLVMCPULowerToAccelUKernels.cpp   | 142 ++++++++++++++++++
 .../iree/compiler/Codegen/LLVMCPU/Passes.cpp  |  31 ++++
 .../iree/compiler/Codegen/LLVMCPU/Passes.h    |   8 +
 .../iree/compiler/Codegen/LLVMCPU/Passes.td   |   8 +
 .../compiler/Codegen/LLVMCPU/test/BUILD.bazel |   1 +
 .../Codegen/LLVMCPU/test/CMakeLists.txt       |   1 +
 .../test/lower_to_accel_ukernel_ops.mlir      |  23 +++
 .../custom_dispatch/cpu/plugin/CMakeLists.txt |   1 +
 13 files changed, 241 insertions(+), 2 deletions(-)
 create mode 100644 compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToAccelUKernels.cpp
 create mode 100644 compiler/src/iree/compiler/Codegen/LLVMCPU/test/lower_to_accel_ukernel_ops.mlir

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/IREECodegenAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/IREECodegenAttrs.td
index fc90e6acf3a6f..37b0f37eaa151 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/IREECodegenAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/IREECodegenAttrs.td
@@ -64,6 +64,8 @@ def SPIRV_WinogradVectorize
 def VMVX_Default
     : I32EnumAttrCase<"VMVXDefault", 300>;
+def CPU_AccelMatmulExpert
+    : I32EnumAttrCase<"AccelMatmulExpert", 25>;
 def Linalg_TransformDialectCodegen
     : I32EnumAttrCase<"TransformDialectCodegen", 1000>;
@@ -79,7 +81,7 @@ def DispatchLoweringPassPipelineEnum : I32EnumAttr<
       CPU_Default, CPU_DoubleTilingExpert, CPU_DoubleTilingPadExpert,
       CPU_DoubleTilingPeelingExpert, CPU_ConvTileAndDecomposeExpert,
       CPU_Mmt4dTilingExpert, CPU_BufferOpsTileAndVectorize,
-      CPU_DataTiling,
+      CPU_DataTiling, CPU_AccelMatmulExpert,
 
       // LLVMGPU CodeGen pipelines
       LLVMGPU_Default, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
index 900d220d77207..07d7d3893e66e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -57,6 +57,7 @@ iree_compiler_cc_library(
         "LLVMCPUEmitVectorizationRemarks.cpp",
         "LLVMCPULinkExecutables.cpp",
         "LLVMCPULowerExecutableTarget.cpp",
+        "LLVMCPULowerToAccelUKernels.cpp",
         "LLVMCPULowerToUKernels.cpp",
         "LLVMCPUMmt4dVectorLowering.cpp",
         "LLVMCPUPeel.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index 8d5feafc38dbb..f11f6c020d21e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -58,6 +58,7 @@ iree_cc_library(
       "LLVMCPUEmitVectorizationRemarks.cpp"
       "LLVMCPULinkExecutables.cpp"
       "LLVMCPULowerExecutableTarget.cpp"
+      "LLVMCPULowerToAccelUKernels.cpp"
      "LLVMCPULowerToUKernels.cpp"
       "LLVMCPUMmt4dVectorLowering.cpp"
       "LLVMCPUPeel.cpp"
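The KernelDispatch change below routes linalg.matmul root ops to the new
AccelMatmulExpert pipeline. For orientation, the selected pipeline is expected
to surface on the dispatch as a translation_info attribute, roughly as follows
(a sketch only; the exact attribute rendering is governed by
IREECodegenAttrs.td, and the tile-size list mirrors the single-entry tileSizes
vector set in setRootConfig below):

    translation_info = #iree_codegen.translation_info<AccelMatmulExpert>
    lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1]]>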
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index b83ac2881433c..db3ba6d6501fa 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1173,6 +1173,18 @@ static LogicalResult setRootConfig(func::FuncOp entryPointFn,
       DispatchLoweringPassPipeline::Mmt4dTilingExpert);
 }
 
+/// Sets the lowering configuration for a dispatch region with a linalg.matmul
+/// root op.
+static LogicalResult setRootConfig(func::FuncOp entryPointFn,
+                                   linalg::MatmulOp matmulOp) {
+  assert(!getLoweringConfig(matmulOp) && "expected lowering_config is not set");
+  SmallVector<int64_t> tileSizes;
+  tileSizes.push_back(1);
+  return setOpConfigAndEntryPointFnTranslation(
+      entryPointFn, matmulOp, tileSizes,
+      DispatchLoweringPassPipeline::AccelMatmulExpert);
+}
+
 /// Sets the lowering configuration for dispatch region for linalg.batch_mmt4d
 /// root op
 static LogicalResult setRootConfig(func::FuncOp entryPointFn,
@@ -1995,7 +2007,7 @@ setRootConfigImpl(func::FuncOp entryPointFn, Operation *op,
                             targetMLTransInfo);
         })
-        .Case<linalg::Mmt4DOp, linalg::BatchMmt4DOp>(
+        .Case<linalg::Mmt4DOp, linalg::MatmulOp, linalg::BatchMmt4DOp>(
            [&](auto op) { return setRootConfig(entryPointFn, op); })
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToAccelUKernels.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToAccelUKernels.cpp
new file mode 100644
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToAccelUKernels.cpp
@@ -0,0 +1,142 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Dialect/IREECodegenDialect.h"
+#include "iree/compiler/Codegen/Dialect/IREECodegenOps.h"
+#include "iree/compiler/Codegen/LLVMCPU/PassDetail.h"
+#include "iree/compiler/Codegen/LLVMCPU/Passes.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+namespace {
+
+class LLVMCPULowerToAccelUKernelsPass
+    : public LLVMCPULowerToAccelUKernelsBase<LLVMCPULowerToAccelUKernelsPass> {
+public:
+  LLVMCPULowerToAccelUKernelsPass() = default;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<IREE::Codegen::IREECodegenDialect, arith::ArithDialect,
+                    tensor::TensorDialect>();
+  }
+
+  void runOnOperation() override;
+
+  LogicalResult initializeOptions(StringRef options) override {
+    if (failed(Pass::initializeOptions(options))) {
+      return failure();
+    }
+    return success();
+  }
+};
+
+/// Holds a function name and attributes.
+struct FnNameAndDefAttrs {
+  std::string name;
+  SmallVector<NamedAttribute> defAttrs;
+};
+
+/// Returns the function name and attributes to use for a ukernel with given
+/// `ukernelName` on the target described by `targetAttr`.
+static FnNameAndDefAttrs
+getFnNameAndDefAttrs(const char *ukernelName, RewriterBase &rewriter,
+                     IREE::HAL::ExecutableTargetAttr targetAttr) {
+  FnNameAndDefAttrs result;
+  result.name = ukernelName;
+  result.defAttrs.emplace_back(
+      rewriter.getStringAttr("hal.import.fields"),
+      rewriter.getArrayAttr({rewriter.getStringAttr("processor_data"),
+                             rewriter.getStringAttr("processor_id")}));
+  result.defAttrs.emplace_back(
+      rewriter.getStringAttr("hal.import.cconv"),
+      IREE::HAL::CallingConventionAttr::get(
+          rewriter.getContext(),
+          IREE::HAL::CallingConvention::ParameterStruct));
+  return result;
+}
+
+/// Matches a (linalg.fill -> )? linalg.matmul operation sequence and converts
+/// it into an iree_codegen.ukernel.generic "accel_matmul_f32" operation, which
+/// is later lowered into a call to the microkernel.
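+///
+/// As a rough illustration (tensor shapes and SSA names here are hypothetical,
+/// not taken from this patch), the rewrite turns
+///
+///   %0 = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
+///                      outs(%acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+///
+/// into
+///
+///   %m = tensor.dim %lhs, %c0 : tensor<?x?xf32>
+///   %n = tensor.dim %rhs, %c0 : tensor<?x?xf32>
+///   %k = tensor.dim %rhs, %c1 : tensor<?x?xf32>
+///   %0 = iree_codegen.ukernel.generic "accel_matmul_f32"
+///        ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
+///        outs(%acc : tensor<?x?xf32>)
+///        (%m, %n, %k : index, index, index)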
+static FailureOr<IREE::Codegen::UKernelOpInterface>
+matchDAGForUKernel(RewriterBase &rewriter, linalg::MatmulOp op) {
+  Value lhs = op.getDpsInputOperand(0)->get();
+  Value rhs = op.getDpsInputOperand(1)->get();
+  Value out = op.getDpsInitOperand(0)->get();
+  auto outType = llvm::cast<ShapedType>(out.getType());
+
+  Location loc = op.getLoc();
+  Value m = rewriter.create<tensor::DimOp>(loc, lhs, 0);
+  Value n = rewriter.create<tensor::DimOp>(loc, rhs, 0);
+  Value k = rewriter.create<tensor::DimOp>(loc, rhs, 1);
+
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op);
+  auto fn = getFnNameAndDefAttrs("accel_matmul_f32", rewriter, targetAttr);
+  auto genericMicroKernelOp = rewriter.create<IREE::Codegen::UKernelGenericOp>(
+      loc, outType, fn.name, ValueRange{lhs, rhs}, out, ValueRange{m, n, k},
+      /*fn_def_attrs=*/rewriter.getDictionaryAttr(fn.defAttrs),
+      /*strided_outer_dims=*/rewriter.getIndexAttr(0));
+  return cast<IREE::Codegen::UKernelOpInterface>(
+      genericMicroKernelOp.getOperation());
+}
+
+template <typename OpType>
+struct LowerToAccelUKernelPattern : OpRewritePattern<OpType> {
+  LowerToAccelUKernelPattern(MLIRContext *context)
+      : OpRewritePattern<OpType>(context) {}
+
+  LogicalResult matchAndRewrite(OpType op,
+                                PatternRewriter &rewriter) const override {
+    FailureOr<IREE::Codegen::UKernelOpInterface> ukernelOp =
+        matchDAGForUKernel(rewriter, op);
+    if (failed(ukernelOp)) {
+      return rewriter.notifyMatchFailure(
+          op, "failed to find microkernel op to replace with");
+    }
+    rewriter.replaceOp(op, ukernelOp.value()->getResults());
+    return success();
+  }
+};
+
+void LLVMCPULowerToAccelUKernelsPass::runOnOperation() {
+  MLIRContext *context = &getContext();
+  RewritePatternSet patterns(context);
+  // Enabling a lowering of an op to a microkernel is a trade-off between the
+  // potential performance advantage of a microkernel over pure code generation
+  // for that op, and the potential benefits of fusions. Indeed, once an op is
+  // lowered into a microkernel, it will never be fused at any MLIR level.
+  // Since microkernels are linked as bitcode, they will still undergo LTO-like
+  // optimization in their calling contexts, but we shouldn't expect this to
+  // achieve similar results as fusing structured ops.
+  patterns.insert<LowerToAccelUKernelPattern<linalg::MatmulOp>>(context);
+  if (failed(
+          applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) {
+    return signalPassFailure();
+  }
+}
+
+} // namespace
+
+std::unique_ptr<OperationPass<>> createLLVMCPULowerToAccelUKernelsPass() {
+  return std::make_unique<LLVMCPULowerToAccelUKernelsPass>();
+}
+
+} // namespace iree_compiler
+} // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index d442a197387c7..dae4559cc22f3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -53,6 +53,11 @@ static llvm::cl::opt<bool> clEnablePadConsumerFusion(
     llvm::cl::desc("Flag to enable the fusion for pad + consumer"),
     llvm::cl::init(false));
 
+static llvm::cl::opt<bool> clEnableAccelMicrokernels(
+    "iree-llvmcpu-enable-accel-ukernels",
+    llvm::cl::desc("Flag to enable lowering to accel ukernels"),
+    llvm::cl::init(false));
+
 static llvm::cl::opt<bool> clEnableMicrokernelsDecomposeLinalgGeneric(
     "iree-vmvx-enable-microkernels-decompose-linalg-generic",
     llvm::cl::desc("Enables decomposition of linalg.generic ops when "
@@ -621,6 +626,32 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
   }
 }
 
+void addAccelMatmulExpertPassPipeline(OpPassManager &passManager,
+                                      TilingConfig &tilingConfig,
+                                      bool enableAccelMicrokernels) {
+  addTileAndDistributePasses(passManager);
+
+  OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
+
+  if (enableAccelMicrokernels) {
+    nestedModulePM.addPass(createLLVMCPULowerToAccelUKernelsPass());
+  } else {
+    nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTileAndFusePass(
+        static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
+    nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTilePass(
+        static_cast<int64_t>(tilingConfig.getVectorReductionLevel())));
+    nestedModulePM.addNestedPass<func::FuncOp>(
+        createGenericVectorizationPass());
+    nestedModulePM.addNestedPass<func::FuncOp>(
+        createHoistRedundantVectorTransfersPass());
+  }
+
+  nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+
+  addBufferizePasses(nestedModulePM);
+}
+
 void addCPUDataTilingPipeline(OpPassManager &passManager,
                               TilingConfig &tilingConfig,
                               bool enableVectorMasking) {
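The accel path is opt-in via the flag added above, presumably threaded through
to addAccelMatmulExpertPassPipeline from LLVMCPULowerExecutableTarget.cpp. A
minimal invocation sketch (the flag name comes from this patch; the input file
is hypothetical):

    iree-compile --iree-hal-target-backends=llvm-cpu \
        --iree-llvmcpu-enable-accel-ukernels \
        matmul.mlir -o matmul.vmfb

With the flag off, the pipeline falls back to the tile, vectorize, and
bufferize path in the else branch.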
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
index 47dad29749e12..d2e51001f3ecb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
@@ -48,6 +48,10 @@ std::unique_ptr<Pass> createExpandF16OpToF32Pass();
 std::unique_ptr<OperationPass<>>
 createLLVMCPULowerToUKernelsPass(bool skipIntermediateRoundings = true);
 
+/// Pass to lower a sequence of operations to an iree_codegen.ukernel.*
+/// operation.
+std::unique_ptr<OperationPass<>> createLLVMCPULowerToAccelUKernelsPass();
+
 std::unique_ptr<OperationPass<func::FuncOp>>
 createLLVMCPUMmt4dVectorLoweringPass();
 
@@ -145,6 +149,10 @@ void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
                                       TilingConfig &tilingConfig,
                                       bool enableMicrokernels);
 
+void addAccelMatmulExpertPassPipeline(OpPassManager &passManager,
+                                      TilingConfig &tilingConfig,
+                                      bool enableAccelMicrokernels);
+
 void addMultiTilingExpertPassPipeline(
     OpPassManager &passManager, TilingConfig &tilingConfig, bool enablePeeling,
     bool enableVectorMasking, bool lowerToAVX2, bool enableAArch64SSVE = false);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
index 69fc43ffc07dd..da886893d5bf3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
@@ -86,6 +86,14 @@ def LLVMCPULowerToUKernels :
   ];
 }
 
+def LLVMCPULowerToAccelUKernels :
+    Pass<"iree-llvmcpu-lower-to-accel-ukernels", ""> {
+  let summary =
+      "Separate out parts of the IR that lower to an accel-micro-kernel";
+  let constructor =
+      "mlir::iree_compiler::createLLVMCPULowerToAccelUKernelsPass()";
+}
+
 def LLVMCPUMmt4dVectorLowering
     : Pass<"iree-llvmcpu-mmt4d-vector-lowering", "func::FuncOp"> {
   let summary = "Apply vector lowering logic to vector ops";
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
index 398c6ac28c26b..a970a9eb7eea9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -35,6 +35,7 @@ iree_lit_test_suite(
             "hal_interface_constants.mlir",
             "hal_interface_workgroup_info.mlir",
             "illegal_configuration.mlir",
+            "lower_to_accel_ukernel_ops.mlir",
             "lower_to_ukernel_ops.mlir",
             "materialize_aarch64_launch_configuration.mlir",
             "materialize_configuration_without_distribution.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 0c2f1d9002cdd..217d6afd69c92 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -30,6 +30,7 @@ iree_lit_test_suite(
     "hal_interface_constants.mlir"
     "hal_interface_workgroup_info.mlir"
     "illegal_configuration.mlir"
+    "lower_to_accel_ukernel_ops.mlir"
     "lower_to_ukernel_ops.mlir"
     "materialize_aarch64_launch_configuration.mlir"
     "materialize_configuration_without_distribution.mlir"
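The lit test below drives the new pass directly with iree-opt, independent of
the pipeline flag. Locally it should be runnable from a configured build tree
with something like the following (exact test-target naming varies by setup),
or directly via the test file's RUN line:

    ctest -R lower_to_accel_ukernel_ops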
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/lower_to_accel_ukernel_ops.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/lower_to_accel_ukernel_ops.mlir
new file mode 100644
index 0000000000000..e41343f394d54
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/lower_to_accel_ukernel_ops.mlir
@@ -0,0 +1,23 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-lower-to-accel-ukernels,cse,canonicalize))" %s | FileCheck %s
+
+func.func @matmul_f32f32f32(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+// CHECK:      func @matmul_f32f32f32(
+// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-DAG:  %[[C0:.+]] = arith.constant 0
+// CHECK-DAG:  %[[C1:.+]] = arith.constant 1
+// CHECK-DAG:  %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]]
+// CHECK-DAG:  %[[N:.+]] = tensor.dim %[[ARG1]], %[[C0]]
+// CHECK-DAG:  %[[K:.+]] = tensor.dim %[[ARG1]], %[[C1]]
+// CHECK:      %[[MICRO_KERNEL:.+]] = iree_codegen.ukernel.generic "accel_matmul_f32"
+// CHECK-SAME:     ins(%[[ARG0]], %[[ARG1]] :
+// CHECK-SAME:     outs(%[[ARG2]] :
+// CHECK-SAME:     (%[[M]], %[[N]], %[[K]] :
+// CHECK-DAG:  "processor_id"
+// CHECK-DAG:  "processor_data"
+// CHECK:      return %[[MICRO_KERNEL]]
diff --git a/samples/custom_dispatch/cpu/plugin/CMakeLists.txt b/samples/custom_dispatch/cpu/plugin/CMakeLists.txt
index 59a5c793f998e..cb675f0fcf08a 100644
--- a/samples/custom_dispatch/cpu/plugin/CMakeLists.txt
+++ b/samples/custom_dispatch/cpu/plugin/CMakeLists.txt
@@ -21,6 +21,7 @@ target_include_directories(iree_samples_custom_dispatch_cpu_system_plugin
     ${IREE_SOURCE_DIR}/runtime/src/
 )
 
+iree_add_all_subdirs()
 # NOTE: this is only required because we want this sample to run on all
 # platforms without needing to change the library name (libfoo.so/foo.dll).
 set_target_properties(iree_samples_custom_dispatch_cpu_system_plugin