Integrate LLVM 2023-09-27 (iree-org#15048)
- Renames transform.tile_to_* ops to transform.tile_using_*.
- Updates vector.extract checks to the new assembly syntax: vector.extract {{.*}} : <result type> from vector<shape x type> (see the sketch below).
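
A minimal before/after sketch of these changes, assembled from the diffs below; the value and handle names (%op, %tiled, %forall, %acc, %row) are illustrative only. As the diffs also show, transform.structured.tile is likewise renamed to transform.structured.tile_using_for, and the tiled-op handle is now returned before the loop handle.

    // Old spelling (loop handle first):
    %forall, %tiled = transform.structured.tile_to_forall_op %op tile_sizes [1]
        ( mapping = [#gpu.block<x>] )
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    // New spelling (tiled op first):
    %tiled, %forall = transform.structured.tile_using_forall %op tile_sizes [1]
        ( mapping = [#gpu.block<x>] )
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Old vector.extract assembly:
    %row = vector.extract %acc[0] : vector<8x8xi32>
    // New assembly spells out the result type as well:
    %row = vector.extract %acc[0] : vector<8xi32> from vector<8x8xi32>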

Co-authored-by: Groverkss <[email protected]>
Co-authored-by: Quinn Dawkins <[email protected]>
Co-authored-by: MaheshRavishankar <[email protected]>
Co-authored-by: Jakub Kuderski <[email protected]>
5 people authored Oct 2, 2023
1 parent 83df8c4 commit 4e6d841
Showing 55 changed files with 418 additions and 407 deletions.
12 changes: 9 additions & 3 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp
@@ -140,11 +140,15 @@ class TileConsumerAndFuseInputProducer final

// Fuse the candidate immediate operands into the tiled loop.
OpBuilder::InsertionGuard guard(rewriter);
auto forLoops =
llvm::to_vector(llvm::map_range(tilingResult->loops, [](Operation *op) {
return cast<scf::ForOp>(op);
}));
while (!candidates.empty()) {
tensor::ExtractSliceOp sliceOp = candidates.back();
candidates.pop_back();
std::optional<scf::SCFFuseProducerOfSliceResult> result =
tileAndFuseProducerOfSlice(rewriter, sliceOp, tilingResult->loops);
tileAndFuseProducerOfSlice(rewriter, sliceOp, forLoops);
if (result) {
// Mark the fused input producer for distribution when writing to shared
// memory. We cannot use the current matmul op's tiling scheme here
@@ -156,6 +160,8 @@ class TileConsumerAndFuseInputProducer final
rewriter, result->tiledAndFusedProducer.getDefiningOp());
}
}
tilingResult->loops = llvm::to_vector(
llvm::map_range(forLoops, [](auto op) -> Operation * { return op; }));
return tilingResult;
}

@@ -304,10 +310,10 @@ static LogicalResult tileAndUnrollConv(func::FuncOp funcOp) {
// Fully unroll the generated loop. This allows us to remove the loop
// for parallel output window dimension, so it helps future vector
// transformations.
ArrayRef<scf::ForOp> loops = tileAndFuseResult.value().loops;
ArrayRef<Operation *> loops = tileAndFuseResult.value().loops;
if (!loops.empty()) {
assert(loops.size() == 1);
scf::ForOp loopOp = loops.front();
scf::ForOp loopOp = cast<scf::ForOp>(loops.front());
IntegerAttr ub;
if (!matchPattern(loopOp.getUpperBound(), m_Constant(&ub))) {
loopOp.emitOpError("upper bound should be a constant");
256 changes: 128 additions & 128 deletions compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir

Large diffs are not rendered by default.

@@ -387,7 +387,7 @@ def ForallToWorkgroupOp : Op<Transform_Dialect,
This region may require arbitrary computations and cannot magically match
what the `stream.cmd.dispatch` has already imposed on us at a distance.
For now we must specify the number of values properly when applying the
topLevel tile_to_forall_op.
topLevel tile_using_forall.

If the unique topLevel scf.forall operation contained within the
FuncOp referred to by the `target` transform handle lowers to workgroup
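
For context on the workgroup-count discussion above, a hedged sketch of a top-level tile_using_forall paired with workgroup-count population, following patterns that appear later in this commit; the %root, %tiled, and %forall handle names are hypothetical.

    %tiled, %forall = transform.structured.tile_using_forall %root tile_sizes [1, 1]
        ( mapping = [#gpu.block<x>, #gpu.block<y>] )
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
        : (!transform.any_op) -> ()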
@@ -68,12 +68,12 @@ func.func @loop_carried_extract(%arg0: f32) -> f32 {
%c10 = arith.constant 10 : index
%0 = vector.broadcast %arg0 : f32 to vector<4xf32>
%20 = scf.for %arg3 = %c0 to %c10 step %c1 iter_args(%arg4 = %0) -> (vector<4xf32>) {
%a = vector.extract %arg4[0] : vector<4xf32>
%a = vector.extract %arg4[0] : f32 from vector<4xf32>
%c = arith.addf %a, %a : f32
%bc = vector.broadcast %c : f32 to vector<4xf32>
scf.yield %bc : vector<4xf32>
}
%21 = vector.extract %20[0] : vector<4xf32>
%21 = vector.extract %20[0] : f32 from vector<4xf32>
return %21 : f32
}

@@ -15,7 +15,7 @@ transform.sequence failures(propagate) {
// Step 1. Map to a single block by tiling with size 1 and fusing.
%fusion_root_1, %fusion_group_1 = transform.iree.take_first %maybe_trailing_0, %combiner_op
: (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
%grid_loop, %outer_tiled = transform.structured.tile_to_forall_op %fusion_root_1 tile_sizes [1]
%outer_tiled, %grid_loop = transform.structured.tile_using_forall %fusion_root_1 tile_sizes [1]
( mapping = [#gpu.block<x>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)

@@ -45,17 +45,17 @@ transform.sequence failures(propagate) {
// ===========================================================================
%fusion_group_22_full = transform.merge_handles %fused_2, %original_fill_2
: !transform.any_op
%block_loop_22, %fusion_root_22_tiled =
transform.structured.tile_to_forall_op %outer_tiled
%fusion_root_22_tiled, %block_loop_22 =
transform.structured.tile_using_forall %outer_tiled
tile_sizes [1] ( mapping = [#gpu.thread<z>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.structured.fuse_into_containing_op %fusion_group_22_full into %block_loop_22 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)


%fusion_group_21 = transform.merge_handles %maybe_leading_2, %more_parallel_fill_2
: !transform.any_op
%block_loop_21, %fusion_root_21_tiled =
transform.structured.tile_to_forall_op %parallel_reduction_2
%fusion_root_21_tiled, %block_loop_21 =
transform.structured.tile_using_forall %parallel_reduction_2
tile_sizes [1, 1] ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.structured.fuse_into_containing_op %fusion_group_21 into %block_loop_21 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -113,6 +113,8 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp,
if (failed(tilingResult)) {
return failure();
}
auto forLoops = llvm::to_vector(llvm::map_range(
tilingResult->loops, [](Operation *op) { return cast<scf::ForOp>(op); }));
yieldedValuesToOrigValues.append(rootOp->result_begin(),
rootOp->result_end());
// A map from untiled value to scf.for iter_arg. The iter_arg is used for DPS
@@ -129,9 +131,9 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp,
tilingResult->tiledOps[0] = replacementTiledOp.value();
}
} else if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(rootOp)) {
for (auto [init, iterArg] :
llvm::zip_equal(dpsOp.getDpsInits(),
tilingResult->loops.back().getRegionIterArgs())) {
for (auto [init, iterArg] : llvm::zip_equal(
dpsOp.getDpsInits(),
cast<scf::ForOp>(forLoops.back()).getRegionIterArgs())) {
mapToIterArg[init] = iterArg;
}
}
@@ -174,20 +176,18 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp,

// Materialize the slice of the producer in place.
std::optional<scf::SCFFuseProducerOfSliceResult> fusedProducer =
tileAndFuseProducerOfSlice(rewriter, candidateSliceOp,
tilingResult->loops);
tileAndFuseProducerOfSlice(rewriter, candidateSliceOp, forLoops);
if (!fusedProducer)
continue;

// Check if the fused producer has other uses that require the value
// to be yielded from within the tiled loop.
OpResult untiledProducer = fusedProducer->origProducer;
if (llvm::any_of(untiledProducer.getUsers(), [&](Operation *user) {
return !isIgnoredUser(user, tilingResult->loops.front());
return !isIgnoredUser(user, forLoops.front());
})) {
yieldReplacementForFusedProducer(rewriter, candidateSliceOp,
fusedProducer.value(),
tilingResult->loops);
fusedProducer.value(), forLoops);
yieldedValuesToOrigValues.push_back(untiledProducer);
}

@@ -198,7 +198,7 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp,
}
}

scf::ForOp outermostLoop = tilingResult->loops.front();
scf::ForOp outermostLoop = forLoops.front();
for (auto [index, origVal] : llvm::enumerate(yieldedValuesToOrigValues)) {
Value replacement = outermostLoop.getResult(index);
rewriter.replaceUsesWithIf(origVal, replacement, [&](OpOperand &use) {
@@ -52,8 +52,8 @@ transform.sequence failures(propagate) {
%original_matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
: (!transform.any_op) -> !transform.any_op

%forall, %matmul =
transform.structured.tile_to_forall_op %original_matmul num_threads [32]
%matmul, %forall =
transform.structured.tile_using_forall %original_matmul num_threads [32]
( mapping = [#gpu.block<x>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)

@@ -113,7 +113,7 @@ hal.executable private @matmul_static_dispatch_0 {
transform.sequence failures(propagate) {
^bb1(%variant_op: !transform.any_op):
%1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%forall_op, %tiled_op = transform.structured.tile_to_forall_op %1 num_threads [] tile_sizes [1, 1, 1](mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]): (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%tiled_op, %forall_op = transform.structured.tile_using_forall %1 num_threads [] tile_sizes [1, 1, 1](mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]): (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> ()
}

@@ -163,6 +163,6 @@ hal.executable private @matmul_static_dispatch_0 {
transform.sequence failures(propagate) {
^bb1(%variant_op: !transform.any_op):
%1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%forall_op, %tiled_op = transform.structured.tile_to_forall_op %1 num_threads [] tile_sizes [5, 3](mapping = [#gpu.block<z>, #gpu.block<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%tiled_op, %forall_op = transform.structured.tile_using_forall %1 num_threads [] tile_sizes [5, 3](mapping = [#gpu.block<z>, #gpu.block<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> ()
}
@@ -5,14 +5,14 @@
// CHECK-SAME: %[[RHS:[^:[:space:]]+]]
// CHECK-SAME: %[[ACC:[^:[:space:]]+]]
// CHECK-DAG: %[[ZERO:.*]] = arith.constant dense<0> : vector<4x4xi8>
// CHECK-DAG: %[[ACC_ROW_0:.*]] = vector.extract %[[ACC]][0] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_1:.*]] = vector.extract %[[ACC]][1] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_2:.*]] = vector.extract %[[ACC]][2] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_3:.*]] = vector.extract %[[ACC]][3] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_4:.*]] = vector.extract %[[ACC]][4] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_5:.*]] = vector.extract %[[ACC]][5] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_6:.*]] = vector.extract %[[ACC]][6] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_7:.*]] = vector.extract %[[ACC]][7] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_0:.*]] = vector.extract %[[ACC]][0] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_1:.*]] = vector.extract %[[ACC]][1] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_2:.*]] = vector.extract %[[ACC]][2] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_3:.*]] = vector.extract %[[ACC]][3] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_4:.*]] = vector.extract %[[ACC]][4] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_5:.*]] = vector.extract %[[ACC]][5] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_6:.*]] = vector.extract %[[ACC]][6] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_7:.*]] = vector.extract %[[ACC]][7] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_CHUNK_00:.*]] = vector.extract_strided_slice %[[ACC_ROW_0]] {offsets = [0]
// CHECK-DAG: %[[ACC_CHUNK_01:.*]] = vector.extract_strided_slice %[[ACC_ROW_0]] {offsets = [4]
// CHECK-DAG: %[[ACC_CHUNK_02:.*]] = vector.extract_strided_slice %[[ACC_ROW_1]] {offsets = [0]
10 changes: 5 additions & 5 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
@@ -38,15 +38,15 @@ transform.sequence failures(propagate) {

// Tile and distribute to workgroups
// ==========================================
%forall_grid, %tiled_attention =
transform.structured.tile_to_forall_op %attention tile_sizes [1, 128]
%tiled_attention, %forall_grid =
transform.structured.tile_using_forall %attention tile_sizes [1, 128]
( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()

// Tile batch dimensions of attention
// ==========================================
%attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%batch_tiled_attn, %loop = transform.structured.tile %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %top_level_func {
transform.apply_patterns.canonicalization
@@ -76,7 +76,7 @@ transform.sequence failures(propagate) {

// Tile and fuse attention ops
// ==========================================
%forall, %tiled_matmul = transform.structured.tile_to_forall_op %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%tiled_matmul, %forall = transform.structured.tile_using_forall %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

%f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
%f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -101,7 +101,7 @@ transform.sequence failures(propagate) {
// Distribute fills and last truncate
// ==========================================
%fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op
%fill_grid, %tiled_fill = transform.structured.tile_to_forall_op %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Vectorize function
// ==========================================