From 53e960146727759735815cac516683abb9bf5f86 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Mon, 25 Nov 2024 16:02:51 -0500 Subject: [PATCH] Integrate llvm-project at fe3c23b439b9a2d00442d9bc6a4ca86f73066a3d (#19287) Still carrying a revert for 1004865f1ca41a9581da8747f34b29862d3ebc3d and a cherry pick for https://github.com/llvm/llvm-project/pull/116650. --- .../Common/GPU/VectorReductionToGPU.cpp | 19 +++++++------- .../TransformExtensions/LLVMGPUExtensions.cpp | 26 +++++++++---------- ...transform_dialect_vector_distribution.mlir | 2 +- third_party/llvm-project | 2 +- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp index 314b5844d966..e458da23707c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp @@ -39,7 +39,7 @@ static void debugPrint(Operation *op, const char *message) { /// Emit shared local memory allocation in case it is needed when lowering the /// warp operations. static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder, - vector::WarpExecuteOnLane0Op warpOp, + gpu::WarpExecuteOnLane0Op warpOp, Type type) { MemRefType memrefType; auto addressSpaceAttr = gpu::AddressSpaceAttr::get( @@ -83,8 +83,7 @@ static bool isUniformLoad(Operation *op) { /// Hoist uniform operations as well as special hal operations that have side /// effect but are safe to move out of the warp single lane region. -static void -moveScalarAndBindingUniformCode(vector::WarpExecuteOnLane0Op warpOp) { +static void moveScalarAndBindingUniformCode(gpu::WarpExecuteOnLane0Op warpOp) { /// Hoist ops without side effect as well as special binding ops. auto canBeHoisted = [](Operation *op, function_ref definedOutside) { @@ -155,12 +154,12 @@ struct InsertToBroadcast final : OpRewritePattern { }; /// Pattern to sink `gpu.barrier` ops out of a `warp_execute_on_lane_0` op. -struct WarpOpBarrier final : OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct WarpOpBarrier final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { - auto yield = cast( + auto yield = cast( warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); Operation *lastNode = yield->getPrevNode(); auto barrierOp = dyn_cast_or_null(lastNode); @@ -233,7 +232,7 @@ struct VectorReductionToGPUPass final auto threadX = builder.create(loc, builder.getIndexType(), gpu::Dimension::x); auto cstGroupSize = builder.create(loc, groupSize); - auto warpOp = builder.create( + auto warpOp = builder.create( loc, TypeRange(), threadX.getResult(), groupSize); warpOp.getWarpRegion().takeBody(funcOp.getFunctionBody()); Block &newBlock = funcOp.getFunctionBody().emplaceBlock(); @@ -243,7 +242,7 @@ struct VectorReductionToGPUPass final warpOp.getWarpRegion().getBlocks().back().back().moveBefore(&newBlock, newBlock.end()); builder.setInsertionPointToEnd(&warpOp.getWarpRegion().getBlocks().back()); - builder.create(loc); + builder.create(loc); debugPrint(funcOp, "after step #2: wrapping code with the warp execute op"); @@ -300,7 +299,7 @@ struct VectorReductionToGPUPass final vector::WarpExecuteOnLane0LoweringOptions options; options.warpAllocationFn = allocateGlobalSharedMemory; options.warpSyncronizationFn = [](Location loc, OpBuilder &builder, - vector::WarpExecuteOnLane0Op warpOp) { + gpu::WarpExecuteOnLane0Op warpOp) { builder.create(loc); }; vector::populateWarpExecuteOnLane0OpToScfForPattern(patterns, options); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp index 7dc2e4093d58..c52ae4bcc157 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp @@ -153,7 +153,7 @@ void transform_dialect::VectorToWarpExecuteOnLane0Op::build( // SCCP. static LogicalResult replaceAllUsesOfLaneWithin(RewriterBase &b, - vector::WarpExecuteOnLane0Op executeOp) { + gpu::WarpExecuteOnLane0Op executeOp) { OpBuilder::InsertionGuard g(b); b.setInsertionPoint(executeOp); Value zero = b.create(executeOp.getLoc(), 0); @@ -225,7 +225,7 @@ static FailureOr isThreadIdxxZeroPredicate(scf::IfOp ifOp) { } struct VectorDistributionResult { - vector::WarpExecuteOnLane0Op warpOp; + gpu::WarpExecuteOnLane0Op warpOp; }; static FailureOr @@ -257,7 +257,7 @@ rewriteScfIfAsWarpExecuteOnLane0(RewriterBase &rewriter, Location loc, rewriter.create(loc, predicate, /*withElseRegion=*/false); rewriter.setInsertionPointToStart(&newIfOp.getThenRegion().front()); } - auto warpOp = rewriter.create( + auto warpOp = rewriter.create( loc, TypeRange(), threadIdxx, warpSize); // Move the code from the previous ifOp to the @@ -270,7 +270,7 @@ rewriteScfIfAsWarpExecuteOnLane0(RewriterBase &rewriter, Location loc, sourceBlock.without_terminator().begin(), sourceBlock.without_terminator().end()); rewriter.setInsertionPointToEnd(&targetBlock); - rewriter.create(loc); + rewriter.create(loc); // Erase old op. rewriter.eraseOp(ifOp); @@ -358,7 +358,7 @@ void transform_dialect::VectorWarpDistributionOp::getEffects( /// Emit shared local memory allocation in case it is needed when lowering the /// warp operations. static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder, - vector::WarpExecuteOnLane0Op warpOp, + gpu::WarpExecuteOnLane0Op warpOp, Type type) { MemRefType memrefType; auto addressSpaceAttr = gpu::AddressSpaceAttr::get( @@ -374,11 +374,11 @@ static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder, return builder.create(loc, memrefType); } -/// Return a value yielded by `warpOp` which statifies the filter lamdba +/// Return a value yielded by `warpOp` which satisfies the filter lambda /// condition and is not dead. -static OpOperand *getWarpResult(vector::WarpExecuteOnLane0Op warpOp, +static OpOperand *getWarpResult(gpu::WarpExecuteOnLane0Op warpOp, function_ref fn) { - auto yield = cast( + auto yield = cast( warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); for (OpOperand &yieldOperand : yield->getOpOperands()) { Value yieldValues = yieldOperand.get(); @@ -426,9 +426,9 @@ class InsertElementToBroadcast final /// } /// gpu.synchronize /// %0 = memref.load %src[%c0] : memref<1024xf32> -struct WarpOpLoad : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp, +struct WarpOpLoad : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred); if (!operand) @@ -476,7 +476,7 @@ struct HoistSharedMemoryAlloc : public OpRewritePattern { PatternRewriter &rewriter) const override { if (!iree_compiler::hasSharedMemoryAddressSpace(alloc.getType())) return failure(); - auto warpParent = alloc->getParentOfType(); + auto warpParent = alloc->getParentOfType(); if (!warpParent) return failure(); alloc->moveBefore(warpParent); @@ -561,7 +561,7 @@ static void populatePropagateVectorDistribution(Operation *target, } static void warpSyncronizationFn(Location loc, OpBuilder &builder, - vector::WarpExecuteOnLane0Op warpOp) { + gpu::WarpExecuteOnLane0Op warpOp) { builder.create(loc); }; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir index 1cad1aa50614..6ee43c98fcf8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir @@ -24,7 +24,7 @@ func.func @reduce_dispatch_0() attributes {translation_info = #translation_info} // WARP-EXECUTE: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index // Single-warp guard filters out threads 32-63. // WARP-EXECUTE: scf.if %[[COND32]] { - // WARP-EXECUTE: vector.warp_execute_on_lane_0(%[[TIDX]])[32] { + // WARP-EXECUTE: gpu.warp_execute_on_lane_0(%[[TIDX]])[32] { // WARP-EXECUTE: %[[V:.*]] = "some_def"() : () -> vector<128xf32> // WARP-EXECUTE: vector.transfer_write %[[V]], %{{.*}} {in_bounds = [true]} : vector<128xf32>, memref<128xf32> diff --git a/third_party/llvm-project b/third_party/llvm-project index 58f1b107d7a3..534730273092 160000 --- a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit 58f1b107d7a377ff6d456f16f060606ea4430041 +Subproject commit 534730273092b8e7d4bedc1a3206d76e6848c6c4