Integrate llvm-project at fe3c23b439b9a2d00442d9bc6a4ca86f73066a3d (iree-org#19287)

Still carrying a revert for 1004865f1ca41a9581da8747f34b29862d3ebc3d and a cherry-pick for llvm/llvm-project#116650.
kuhar authored Nov 25, 2024
1 parent 1a7b51d, commit 53e9601
Showing 4 changed files with 24 additions and 25 deletions.
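
The IREE-side changes in this integrate are a mechanical rename: upstream MLIR moved the warp-execute op and its terminator out of the vector dialect, so vector::WarpExecuteOnLane0Op and vector::YieldOp become gpu::WarpExecuteOnLane0Op and gpu::YieldOp in the C++ sources, and the assembly form checked in the lit test changes from vector.warp_execute_on_lane_0 to gpu.warp_execute_on_lane_0. A minimal sketch of the post-integrate builder usage, mirroring the pass code in the diff below; it is illustrative only and assumes `loc`, `builder`, a lane-id value `laneId`, and an integer `warpSize` are in scope, along with the same headers and using-directives as these files:

// Illustrative sketch, not code from this commit.
// Create the warp op in its new home, the gpu dialect.
auto warpOp = builder.create<gpu::WarpExecuteOnLane0Op>(
    loc, /*resultTypes=*/TypeRange(), /*laneId=*/laneId, /*warpSize=*/warpSize);
// The single-lane region is now terminated with gpu::YieldOp instead of
// vector::YieldOp. (The pass below instead populates the region via takeBody
// before appending the terminator.)
builder.setInsertionPointToEnd(&warpOp.getWarpRegion().getBlocks().back());
builder.create<gpu::YieldOp>(loc);
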
@@ -39,7 +39,7 @@ static void debugPrint(Operation *op, const char *message) {
/// Emit shared local memory allocation in case it is needed when lowering the
/// warp operations.
static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder,
- vector::WarpExecuteOnLane0Op warpOp,
+ gpu::WarpExecuteOnLane0Op warpOp,
Type type) {
MemRefType memrefType;
auto addressSpaceAttr = gpu::AddressSpaceAttr::get(
@@ -83,8 +83,7 @@ static bool isUniformLoad(Operation *op) {

/// Hoist uniform operations as well as special hal operations that have side
/// effect but are safe to move out of the warp single lane region.
- static void
- moveScalarAndBindingUniformCode(vector::WarpExecuteOnLane0Op warpOp) {
+ static void moveScalarAndBindingUniformCode(gpu::WarpExecuteOnLane0Op warpOp) {
/// Hoist ops without side effect as well as special binding ops.
auto canBeHoisted = [](Operation *op,
function_ref<bool(Value)> definedOutside) {
@@ -155,12 +154,12 @@ struct InsertToBroadcast final : OpRewritePattern<vector::InsertOp> {
};

/// Pattern to sink `gpu.barrier` ops out of a `warp_execute_on_lane_0` op.
- struct WarpOpBarrier final : OpRewritePattern<vector::WarpExecuteOnLane0Op> {
- using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
+ struct WarpOpBarrier final : OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
+ using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;

- LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
- auto yield = cast<vector::YieldOp>(
+ auto yield = cast<gpu::YieldOp>(
warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
Operation *lastNode = yield->getPrevNode();
auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
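
Downstream rewrite patterns migrate the same way: the OpRewritePattern template argument, the matchAndRewrite signature, and the cast of the region terminator all switch from the vector types to the gpu types, as the WarpOpBarrier hunk above shows. A bare skeleton under that assumption follows; the pattern name and the empty body are hypothetical and only illustrate the renamed types, assuming the same headers and using-directives as this file:

// Hypothetical skeleton, not part of this commit.
struct ExampleWarpPattern final : OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
  using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // The region terminator is now gpu::YieldOp rather than vector::YieldOp.
    auto yield = cast<gpu::YieldOp>(
        warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
    // No rewrite is performed in this skeleton; real patterns inspect `yield`
    // and the ops feeding it, then rewrite around `warpOp`.
    (void)yield;
    return failure();
  }
};
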
@@ -233,7 +232,7 @@ struct VectorReductionToGPUPass final
auto threadX = builder.create<gpu::ThreadIdOp>(loc, builder.getIndexType(),
gpu::Dimension::x);
auto cstGroupSize = builder.create<arith::ConstantIndexOp>(loc, groupSize);
- auto warpOp = builder.create<vector::WarpExecuteOnLane0Op>(
+ auto warpOp = builder.create<gpu::WarpExecuteOnLane0Op>(
loc, TypeRange(), threadX.getResult(), groupSize);
warpOp.getWarpRegion().takeBody(funcOp.getFunctionBody());
Block &newBlock = funcOp.getFunctionBody().emplaceBlock();
@@ -243,7 +242,7 @@
warpOp.getWarpRegion().getBlocks().back().back().moveBefore(&newBlock,
newBlock.end());
builder.setInsertionPointToEnd(&warpOp.getWarpRegion().getBlocks().back());
- builder.create<vector::YieldOp>(loc);
+ builder.create<gpu::YieldOp>(loc);

debugPrint(funcOp, "after step #2: wrapping code with the warp execute op");

@@ -300,7 +299,7 @@ struct VectorReductionToGPUPass final
vector::WarpExecuteOnLane0LoweringOptions options;
options.warpAllocationFn = allocateGlobalSharedMemory;
options.warpSyncronizationFn = [](Location loc, OpBuilder &builder,
- vector::WarpExecuteOnLane0Op warpOp) {
+ gpu::WarpExecuteOnLane0Op warpOp) {
builder.create<gpu::BarrierOp>(loc);
};
vector::populateWarpExecuteOnLane0OpToScfForPattern(patterns, options);
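
One detail worth noting in the hunk above: the lowering entry points keep their vector-dialect names while the callbacks they invoke now receive the gpu op. Read as straight post-integrate code rather than a diff, the wiring looks roughly like this; `patterns` is assumed to be a RewritePatternSet in scope, and allocateGlobalSharedMemory is the static helper defined earlier in this file:

// Post-integrate wiring, sketched from the hunk above.
vector::WarpExecuteOnLane0LoweringOptions options;
options.warpAllocationFn = allocateGlobalSharedMemory;
// The synchronization hook now takes the gpu op but still just emits a
// gpu.barrier.
options.warpSyncronizationFn = [](Location loc, OpBuilder &builder,
                                  gpu::WarpExecuteOnLane0Op warpOp) {
  builder.create<gpu::BarrierOp>(loc);
};
vector::populateWarpExecuteOnLane0OpToScfForPattern(patterns, options);
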
@@ -153,7 +153,7 @@ void transform_dialect::VectorToWarpExecuteOnLane0Op::build(
// SCCP.
static LogicalResult
replaceAllUsesOfLaneWithin(RewriterBase &b,
- vector::WarpExecuteOnLane0Op executeOp) {
+ gpu::WarpExecuteOnLane0Op executeOp) {
OpBuilder::InsertionGuard g(b);
b.setInsertionPoint(executeOp);
Value zero = b.create<arith::ConstantIndexOp>(executeOp.getLoc(), 0);
@@ -225,7 +225,7 @@ static FailureOr<gpu::ThreadIdOp> isThreadIdxxZeroPredicate(scf::IfOp ifOp) {
}

struct VectorDistributionResult {
- vector::WarpExecuteOnLane0Op warpOp;
+ gpu::WarpExecuteOnLane0Op warpOp;
};

static FailureOr<VectorDistributionResult>
@@ -257,7 +257,7 @@ rewriteScfIfAsWarpExecuteOnLane0(RewriterBase &rewriter, Location loc,
rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
rewriter.setInsertionPointToStart(&newIfOp.getThenRegion().front());
}
- auto warpOp = rewriter.create<vector::WarpExecuteOnLane0Op>(
+ auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
loc, TypeRange(), threadIdxx, warpSize);

// Move the code from the previous ifOp to the
@@ -270,7 +270,7 @@
sourceBlock.without_terminator().begin(),
sourceBlock.without_terminator().end());
rewriter.setInsertionPointToEnd(&targetBlock);
- rewriter.create<vector::YieldOp>(loc);
+ rewriter.create<gpu::YieldOp>(loc);

// Erase old op.
rewriter.eraseOp(ifOp);
@@ -358,7 +358,7 @@ void transform_dialect::VectorWarpDistributionOp::getEffects(
/// Emit shared local memory allocation in case it is needed when lowering the
/// warp operations.
static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder,
- vector::WarpExecuteOnLane0Op warpOp,
+ gpu::WarpExecuteOnLane0Op warpOp,
Type type) {
MemRefType memrefType;
auto addressSpaceAttr = gpu::AddressSpaceAttr::get(
@@ -374,11 +374,11 @@ static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder,
return builder.create<memref::AllocOp>(loc, memrefType);
}

- /// Return a value yielded by `warpOp` which statifies the filter lamdba
+ /// Return a value yielded by `warpOp` which satisfies the filter lambda
/// condition and is not dead.
- static OpOperand *getWarpResult(vector::WarpExecuteOnLane0Op warpOp,
+ static OpOperand *getWarpResult(gpu::WarpExecuteOnLane0Op warpOp,
function_ref<bool(Operation *)> fn) {
- auto yield = cast<vector::YieldOp>(
+ auto yield = cast<gpu::YieldOp>(
warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
for (OpOperand &yieldOperand : yield->getOpOperands()) {
Value yieldValues = yieldOperand.get();
@@ -426,9 +426,9 @@ class InsertElementToBroadcast final
/// }
/// gpu.synchronize
/// %0 = memref.load %src[%c0] : memref<1024xf32>
- struct WarpOpLoad : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
- using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
- LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+ struct WarpOpLoad : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
+ using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<memref::LoadOp>);
if (!operand)
@@ -476,7 +476,7 @@ struct HoistSharedMemoryAlloc : public OpRewritePattern<memref::AllocOp> {
PatternRewriter &rewriter) const override {
if (!iree_compiler::hasSharedMemoryAddressSpace(alloc.getType()))
return failure();
- auto warpParent = alloc->getParentOfType<vector::WarpExecuteOnLane0Op>();
+ auto warpParent = alloc->getParentOfType<gpu::WarpExecuteOnLane0Op>();
if (!warpParent)
return failure();
alloc->moveBefore(warpParent);
@@ -561,7 +561,7 @@ static void populatePropagateVectorDistribution(Operation *target,
}

static void warpSyncronizationFn(Location loc, OpBuilder &builder,
- vector::WarpExecuteOnLane0Op warpOp) {
+ gpu::WarpExecuteOnLane0Op warpOp) {
builder.create<gpu::BarrierOp>(loc);
};

@@ -24,7 +24,7 @@ func.func @reduce_dispatch_0() attributes {translation_info = #translation_info}
// WARP-EXECUTE: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index
// Single-warp guard filters out threads 32-63.
// WARP-EXECUTE: scf.if %[[COND32]] {
- // WARP-EXECUTE: vector.warp_execute_on_lane_0(%[[TIDX]])[32] {
+ // WARP-EXECUTE: gpu.warp_execute_on_lane_0(%[[TIDX]])[32] {
// WARP-EXECUTE: %[[V:.*]] = "some_def"() : () -> vector<128xf32>
// WARP-EXECUTE: vector.transfer_write %[[V]], %{{.*}} {in_bounds = [true]} : vector<128xf32>, memref<128xf32>

third_party/llvm-project (2 changes: 1 addition & 1 deletion)
Submodule llvm-project updated 1295 files
