Integrate LLVM 2023-09-27 (iree-org#15048)
- Renames transform.tile_to_* ops to transform.tile_using_*.
- Updates vector.extract checks to the new assembly syntax: vector.extract {{.*}} : <result type> from vector<shape x type> (see the sketch below).
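
A minimal before/after sketch of these changes, assembled from the diffs below; the value and handle names (%op, %tiled, %forall, %acc, %row) are illustrative only. As the diffs also show, transform.structured.tile is likewise renamed to transform.structured.tile_using_for, and the tiled-op handle is now returned before the loop handle.

    // Old spelling (loop handle first):
    %forall, %tiled = transform.structured.tile_to_forall_op %op tile_sizes [1]
        ( mapping = [#gpu.block<x>] )
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    // New spelling (tiled op first):
    %tiled, %forall = transform.structured.tile_using_forall %op tile_sizes [1]
        ( mapping = [#gpu.block<x>] )
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Old vector.extract assembly:
    %row = vector.extract %acc[0] : vector<8x8xi32>
    // New assembly spells out the result type as well:
    %row = vector.extract %acc[0] : vector<8xi32> from vector<8x8xi32>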

Co-authored-by: Groverkss <[email protected]>
Co-authored-by: Quinn Dawkins <[email protected]>
Co-authored-by: MaheshRavishankar <[email protected]>
Co-authored-by: Jakub Kuderski <[email protected]>
5 people authored Oct 2, 2023
1 parent 83df8c4 commit 4e6d841
Showing 55 changed files with 418 additions and 407 deletions.
12 changes: 9 additions & 3 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp
@@ -140,11 +140,15 @@ class TileConsumerAndFuseInputProducer final

// Fuse the candidate immediate operands into the tiled loop.
OpBuilder::InsertionGuard guard(rewriter);
auto forLoops =
llvm::to_vector(llvm::map_range(tilingResult->loops, [](Operation *op) {
return cast<scf::ForOp>(op);
}));
while (!candidates.empty()) {
tensor::ExtractSliceOp sliceOp = candidates.back();
candidates.pop_back();
std::optional<scf::SCFFuseProducerOfSliceResult> result =
tileAndFuseProducerOfSlice(rewriter, sliceOp, tilingResult->loops);
tileAndFuseProducerOfSlice(rewriter, sliceOp, forLoops);
if (result) {
// Mark the fused input producer for distribution when writing to shared
// memory. We cannot use the current matmul op's tiling scheme here
@@ -156,6 +160,8 @@ class TileConsumerAndFuseInputProducer final
rewriter, result->tiledAndFusedProducer.getDefiningOp());
}
}
tilingResult->loops = llvm::to_vector(
llvm::map_range(forLoops, [](auto op) -> Operation * { return op; }));
return tilingResult;
}

@@ -304,10 +310,10 @@ static LogicalResult tileAndUnrollConv(func::FuncOp funcOp) {
// Fully unroll the generated loop. This allows us to remove the loop
// for parallel output window dimension, so it helps future vector
// transformations.
ArrayRef<scf::ForOp> loops = tileAndFuseResult.value().loops;
ArrayRef<Operation *> loops = tileAndFuseResult.value().loops;
if (!loops.empty()) {
assert(loops.size() == 1);
scf::ForOp loopOp = loops.front();
scf::ForOp loopOp = cast<scf::ForOp>(loops.front());
IntegerAttr ub;
if (!matchPattern(loopOp.getUpperBound(), m_Constant(&ub))) {
loopOp.emitOpError("upper bound should be a constant");
256 changes: 128 additions & 128 deletions compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir

Large diffs are not rendered by default.

@@ -387,7 +387,7 @@ def ForallToWorkgroupOp : Op<Transform_Dialect,
This region may require arbitrary computations and cannot magically match
what the `stream.cmd.dispatch` has already imposed on us at a distance.
For now we must specify the number of values properly when applying the
topLevel tile_to_forall_op.
topLevel tile_using_forall.

If the unique topLevel scf.forall operation contained within the
FuncOp referred to by the `target` transform handle lowers to workgroup
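
For context on the workgroup-count discussion above, a hedged sketch of a top-level tile_using_forall paired with workgroup-count population, following patterns that appear later in this commit; the %root, %tiled, and %forall handle names are hypothetical.

    %tiled, %forall = transform.structured.tile_using_forall %root tile_sizes [1, 1]
        ( mapping = [#gpu.block<x>, #gpu.block<y>] )
        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
        : (!transform.any_op) -> ()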
@@ -68,12 +68,12 @@ func.func @loop_carried_extract(%arg0: f32) -> f32 {
%c10 = arith.constant 10 : index
%0 = vector.broadcast %arg0 : f32 to vector<4xf32>
%20 = scf.for %arg3 = %c0 to %c10 step %c1 iter_args(%arg4 = %0) -> (vector<4xf32>) {
%a = vector.extract %arg4[0] : vector<4xf32>
%a = vector.extract %arg4[0] : f32 from vector<4xf32>
%c = arith.addf %a, %a : f32
%bc = vector.broadcast %c : f32 to vector<4xf32>
scf.yield %bc : vector<4xf32>
}
%21 = vector.extract %20[0] : vector<4xf32>
%21 = vector.extract %20[0] : f32 from vector<4xf32>
return %21 : f32
}

@@ -15,7 +15,7 @@ transform.sequence failures(propagate) {
// Step 1. Map to a single block by tiling with size 1 and fusing.
%fusion_root_1, %fusion_group_1 = transform.iree.take_first %maybe_trailing_0, %combiner_op
: (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
%grid_loop, %outer_tiled = transform.structured.tile_to_forall_op %fusion_root_1 tile_sizes [1]
%outer_tiled, %grid_loop = transform.structured.tile_using_forall %fusion_root_1 tile_sizes [1]
( mapping = [#gpu.block<x>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)

@@ -45,17 +45,17 @@ transform.sequence failures(propagate) {
// ===========================================================================
%fusion_group_22_full = transform.merge_handles %fused_2, %original_fill_2
: !transform.any_op
%block_loop_22, %fusion_root_22_tiled =
transform.structured.tile_to_forall_op %outer_tiled
%fusion_root_22_tiled, %block_loop_22 =
transform.structured.tile_using_forall %outer_tiled
tile_sizes [1] ( mapping = [#gpu.thread<z>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.structured.fuse_into_containing_op %fusion_group_22_full into %block_loop_22 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)


%fusion_group_21 = transform.merge_handles %maybe_leading_2, %more_parallel_fill_2
: !transform.any_op
%block_loop_21, %fusion_root_21_tiled =
transform.structured.tile_to_forall_op %parallel_reduction_2
%fusion_root_21_tiled, %block_loop_21 =
transform.structured.tile_using_forall %parallel_reduction_2
tile_sizes [1, 1] ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.structured.fuse_into_containing_op %fusion_group_21 into %block_loop_21 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -113,6 +113,8 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp,
if (failed(tilingResult)) {
return failure();
}
auto forLoops = llvm::to_vector(llvm::map_range(
tilingResult->loops, [](Operation *op) { return cast<scf::ForOp>(op); }));
yieldedValuesToOrigValues.append(rootOp->result_begin(),
rootOp->result_end());
// A map from untiled value to scf.for iter_arg. The iter_arg is used for DPS
@@ -129,9 +131,9 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp,
tilingResult->tiledOps[0] = replacementTiledOp.value();
}
} else if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(rootOp)) {
for (auto [init, iterArg] :
llvm::zip_equal(dpsOp.getDpsInits(),
tilingResult->loops.back().getRegionIterArgs())) {
for (auto [init, iterArg] : llvm::zip_equal(
dpsOp.getDpsInits(),
cast<scf::ForOp>(forLoops.back()).getRegionIterArgs())) {
mapToIterArg[init] = iterArg;
}
}
@@ -174,20 +176,18 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp,

// Materialize the slice of the producer in place.
std::optional<scf::SCFFuseProducerOfSliceResult> fusedProducer =
tileAndFuseProducerOfSlice(rewriter, candidateSliceOp,
tilingResult->loops);
tileAndFuseProducerOfSlice(rewriter, candidateSliceOp, forLoops);
if (!fusedProducer)
continue;

// Check if the fused producer has other uses that require the value
// to be yielded from within the tiled loop.
OpResult untiledProducer = fusedProducer->origProducer;
if (llvm::any_of(untiledProducer.getUsers(), [&](Operation *user) {
return !isIgnoredUser(user, tilingResult->loops.front());
return !isIgnoredUser(user, forLoops.front());
})) {
yieldReplacementForFusedProducer(rewriter, candidateSliceOp,
fusedProducer.value(),
tilingResult->loops);
fusedProducer.value(), forLoops);
yieldedValuesToOrigValues.push_back(untiledProducer);
}

@@ -198,7 +198,7 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp,
}
}

scf::ForOp outermostLoop = tilingResult->loops.front();
scf::ForOp outermostLoop = forLoops.front();
for (auto [index, origVal] : llvm::enumerate(yieldedValuesToOrigValues)) {
Value replacement = outermostLoop.getResult(index);
rewriter.replaceUsesWithIf(origVal, replacement, [&](OpOperand &use) {
@@ -52,8 +52,8 @@ transform.sequence failures(propagate) {
%original_matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
: (!transform.any_op) -> !transform.any_op

%forall, %matmul =
transform.structured.tile_to_forall_op %original_matmul num_threads [32]
%matmul, %forall =
transform.structured.tile_using_forall %original_matmul num_threads [32]
( mapping = [#gpu.block<x>] )
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)

@@ -113,7 +113,7 @@ hal.executable private @matmul_static_dispatch_0 {
transform.sequence failures(propagate) {
^bb1(%variant_op: !transform.any_op):
%1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%forall_op, %tiled_op = transform.structured.tile_to_forall_op %1 num_threads [] tile_sizes [1, 1, 1](mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]): (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%tiled_op, %forall_op = transform.structured.tile_using_forall %1 num_threads [] tile_sizes [1, 1, 1](mapping = [#gpu.block<x>, #gpu.block<y>, #gpu.block<z>]): (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> ()
}

@@ -163,6 +163,6 @@ hal.executable private @matmul_static_dispatch_0 {
transform.sequence failures(propagate) {
^bb1(%variant_op: !transform.any_op):
%1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%forall_op, %tiled_op = transform.structured.tile_to_forall_op %1 num_threads [] tile_sizes [5, 3](mapping = [#gpu.block<z>, #gpu.block<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%tiled_op, %forall_op = transform.structured.tile_using_forall %1 num_threads [] tile_sizes [5, 3](mapping = [#gpu.block<z>, #gpu.block<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> ()
}
@@ -5,14 +5,14 @@
// CHECK-SAME: %[[RHS:[^:[:space:]]+]]
// CHECK-SAME: %[[ACC:[^:[:space:]]+]]
// CHECK-DAG: %[[ZERO:.*]] = arith.constant dense<0> : vector<4x4xi8>
// CHECK-DAG: %[[ACC_ROW_0:.*]] = vector.extract %[[ACC]][0] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_1:.*]] = vector.extract %[[ACC]][1] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_2:.*]] = vector.extract %[[ACC]][2] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_3:.*]] = vector.extract %[[ACC]][3] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_4:.*]] = vector.extract %[[ACC]][4] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_5:.*]] = vector.extract %[[ACC]][5] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_6:.*]] = vector.extract %[[ACC]][6] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_7:.*]] = vector.extract %[[ACC]][7] : vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_0:.*]] = vector.extract %[[ACC]][0] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_1:.*]] = vector.extract %[[ACC]][1] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_2:.*]] = vector.extract %[[ACC]][2] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_3:.*]] = vector.extract %[[ACC]][3] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_4:.*]] = vector.extract %[[ACC]][4] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_5:.*]] = vector.extract %[[ACC]][5] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_6:.*]] = vector.extract %[[ACC]][6] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_ROW_7:.*]] = vector.extract %[[ACC]][7] : vector<8xi32> from vector<8x8xi32>
// CHECK-DAG: %[[ACC_CHUNK_00:.*]] = vector.extract_strided_slice %[[ACC_ROW_0]] {offsets = [0]
// CHECK-DAG: %[[ACC_CHUNK_01:.*]] = vector.extract_strided_slice %[[ACC_ROW_0]] {offsets = [4]
// CHECK-DAG: %[[ACC_CHUNK_02:.*]] = vector.extract_strided_slice %[[ACC_ROW_1]] {offsets = [0]
10 changes: 5 additions & 5 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
@@ -38,15 +38,15 @@ transform.sequence failures(propagate) {

// Tile and distribute to workgroups
// ==========================================
%forall_grid, %tiled_attention =
transform.structured.tile_to_forall_op %attention tile_sizes [1, 128]
%tiled_attention, %forall_grid =
transform.structured.tile_using_forall %attention tile_sizes [1, 128]
( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()

// Tile batch dimensions of attention
// ==========================================
%attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%batch_tiled_attn, %loop = transform.structured.tile %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %top_level_func {
transform.apply_patterns.canonicalization
@@ -76,7 +76,7 @@ transform.sequence failures(propagate) {

// Tile and fuse attention ops
// ==========================================
%forall, %tiled_matmul = transform.structured.tile_to_forall_op %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%tiled_matmul, %forall = transform.structured.tile_using_forall %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

%f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
%f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -101,7 +101,7 @@ transform.sequence failures(propagate) {
// Distribute fills and last truncate
// ==========================================
%fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op
%fill_grid, %tiled_fill = transform.structured.tile_to_forall_op %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Vectorize function
// ==========================================