diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp index c2664ff06544..d6b400381747 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp @@ -140,11 +140,15 @@ class TileConsumerAndFuseInputProducer final // Fuse the candidate immediate operands into the tiled loop. OpBuilder::InsertionGuard guard(rewriter); + auto forLoops = + llvm::to_vector(llvm::map_range(tilingResult->loops, [](Operation *op) { + return cast<scf::ForOp>(op); + })); while (!candidates.empty()) { tensor::ExtractSliceOp sliceOp = candidates.back(); candidates.pop_back(); std::optional<scf::SCFFuseProducerOfSliceResult> result = - tileAndFuseProducerOfSlice(rewriter, sliceOp, tilingResult->loops); + tileAndFuseProducerOfSlice(rewriter, sliceOp, forLoops); if (result) { // Mark the fused input producer for distribution when writing to shared // memory. We cannot use the current matmul op's tiling scheme here @@ -156,6 +160,8 @@ class TileConsumerAndFuseInputProducer final rewriter, result->tiledAndFusedProducer.getDefiningOp()); } } + tilingResult->loops = llvm::to_vector( + llvm::map_range(forLoops, [](auto op) -> Operation * { return op; })); return tilingResult; } @@ -304,10 +310,10 @@ static LogicalResult tileAndUnrollConv(func::FuncOp funcOp) { // Fully unroll the generated loop. This allows us to remove the loop // for parallel output window dimension, so it helps future vector // transformations. - ArrayRef<scf::ForOp> loops = tileAndFuseResult.value().loops; + ArrayRef<Operation *> loops = tileAndFuseResult.value().loops; if (!loops.empty()) { assert(loops.size() == 1); - scf::ForOp loopOp = loops.front(); + scf::ForOp loopOp = cast<scf::ForOp>(loops.front()); IntegerAttr ub; if (!matchPattern(loopOp.getUpperBound(), m_Constant(&ub))) { loopOp.emitOpError("upper bound should be a constant"); diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir index 46fca6e226f5..e6f942c6a4d4 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir @@ -267,149 +267,149 @@ func.func @nvidia_tenscore_schedule_f16() { scf.yield %278, %280, %282, %284, %286, %288, %290, %292, %293, %294, %295, %296, %297, %298, %299, %300, %301, %302, %303, %304, %305, %306, %307, %308, %309, %310, %311, %312, %313, %314, %315, %316 : vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16> } %7 = gpu.lane_id - %8 = vector.extract %6#31[0] : vector<2x2xf16> + %8 = vector.extract %6#31[0] : vector<2xf16> from vector<2x2xf16> %9 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 48)>()[%1, %7] %10 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 56)>()[%0, %7] vector.store %8, %alloc[%9, %10] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %11 = vector.extract %6#31[1] : vector<2x2xf16> + %11 = vector.extract 
%6#31[1] : vector<2xf16> from vector<2x2xf16> %12 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 56)>()[%1, %7] vector.store %11, %alloc[%12, %10] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %13 = vector.extract %6#30[0] : vector<2x2xf16> + %13 = vector.extract %6#30[0] : vector<2xf16> from vector<2x2xf16> %14 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 48)>()[%0, %7] vector.store %13, %alloc[%9, %14] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %15 = vector.extract %6#30[1] : vector<2x2xf16> + %15 = vector.extract %6#30[1] : vector<2xf16> from vector<2x2xf16> vector.store %15, %alloc[%12, %14] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %16 = vector.extract %6#29[0] : vector<2x2xf16> + %16 = vector.extract %6#29[0] : vector<2xf16> from vector<2x2xf16> %17 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 40)>()[%0, %7] vector.store %16, %alloc[%9, %17] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %18 = vector.extract %6#29[1] : vector<2x2xf16> + %18 = vector.extract %6#29[1] : vector<2xf16> from vector<2x2xf16> vector.store %18, %alloc[%12, %17] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %19 = vector.extract %6#28[0] : vector<2x2xf16> + %19 = vector.extract %6#28[0] : vector<2xf16> from vector<2x2xf16> %20 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 32)>()[%0, %7] vector.store %19, %alloc[%9, %20] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %21 = vector.extract %6#28[1] : vector<2x2xf16> + %21 = vector.extract %6#28[1] : vector<2xf16> from vector<2x2xf16> vector.store %21, %alloc[%12, %20] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %22 = vector.extract %6#27[0] : vector<2x2xf16> + %22 = vector.extract %6#27[0] : vector<2xf16> from vector<2x2xf16> %23 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 24)>()[%0, %7] vector.store %22, %alloc[%9, %23] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %24 = vector.extract %6#27[1] : vector<2x2xf16> + %24 = vector.extract %6#27[1] : vector<2xf16> from vector<2x2xf16> vector.store %24, %alloc[%12, %23] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %25 = vector.extract %6#26[0] : vector<2x2xf16> + %25 = vector.extract %6#26[0] : vector<2xf16> from vector<2x2xf16> %26 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 16)>()[%0, %7] vector.store %25, %alloc[%9, %26] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %27 = vector.extract %6#26[1] : vector<2x2xf16> + %27 = vector.extract %6#26[1] : vector<2xf16> from vector<2x2xf16> vector.store %27, %alloc[%12, %26] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %28 = vector.extract %6#25[0] : vector<2x2xf16> + %28 = vector.extract %6#25[0] : vector<2xf16> from vector<2x2xf16> %29 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 8)>()[%0, %7] vector.store %28, %alloc[%9, %29] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %30 = vector.extract %6#25[1] : vector<2x2xf16> + %30 = vector.extract %6#25[1] : vector<2xf16> from vector<2x2xf16> vector.store %30, %alloc[%12, %29] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %31 = vector.extract %6#24[0] : vector<2x2xf16> + %31 = vector.extract %6#24[0] : vector<2xf16> from 
vector<2x2xf16> %32 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64)>()[%0, %7] vector.store %31, %alloc[%9, %32] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %33 = vector.extract %6#24[1] : vector<2x2xf16> + %33 = vector.extract %6#24[1] : vector<2xf16> from vector<2x2xf16> vector.store %33, %alloc[%12, %32] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %34 = vector.extract %6#23[0] : vector<2x2xf16> + %34 = vector.extract %6#23[0] : vector<2xf16> from vector<2x2xf16> %35 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 32)>()[%1, %7] vector.store %34, %alloc[%35, %10] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %36 = vector.extract %6#23[1] : vector<2x2xf16> + %36 = vector.extract %6#23[1] : vector<2xf16> from vector<2x2xf16> %37 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 40)>()[%1, %7] vector.store %36, %alloc[%37, %10] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %38 = vector.extract %6#22[0] : vector<2x2xf16> + %38 = vector.extract %6#22[0] : vector<2xf16> from vector<2x2xf16> vector.store %38, %alloc[%35, %14] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %39 = vector.extract %6#22[1] : vector<2x2xf16> + %39 = vector.extract %6#22[1] : vector<2xf16> from vector<2x2xf16> vector.store %39, %alloc[%37, %14] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %40 = vector.extract %6#21[0] : vector<2x2xf16> + %40 = vector.extract %6#21[0] : vector<2xf16> from vector<2x2xf16> vector.store %40, %alloc[%35, %17] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %41 = vector.extract %6#21[1] : vector<2x2xf16> + %41 = vector.extract %6#21[1] : vector<2xf16> from vector<2x2xf16> vector.store %41, %alloc[%37, %17] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %42 = vector.extract %6#20[0] : vector<2x2xf16> + %42 = vector.extract %6#20[0] : vector<2xf16> from vector<2x2xf16> vector.store %42, %alloc[%35, %20] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %43 = vector.extract %6#20[1] : vector<2x2xf16> + %43 = vector.extract %6#20[1] : vector<2xf16> from vector<2x2xf16> vector.store %43, %alloc[%37, %20] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %44 = vector.extract %6#19[0] : vector<2x2xf16> + %44 = vector.extract %6#19[0] : vector<2xf16> from vector<2x2xf16> vector.store %44, %alloc[%35, %23] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %45 = vector.extract %6#19[1] : vector<2x2xf16> + %45 = vector.extract %6#19[1] : vector<2xf16> from vector<2x2xf16> vector.store %45, %alloc[%37, %23] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %46 = vector.extract %6#18[0] : vector<2x2xf16> + %46 = vector.extract %6#18[0] : vector<2xf16> from vector<2x2xf16> vector.store %46, %alloc[%35, %26] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %47 = vector.extract %6#18[1] : vector<2x2xf16> + %47 = vector.extract %6#18[1] : vector<2xf16> from vector<2x2xf16> vector.store %47, %alloc[%37, %26] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %48 = vector.extract %6#17[0] : vector<2x2xf16> + %48 = vector.extract %6#17[0] : vector<2xf16> from vector<2x2xf16> vector.store %48, %alloc[%35, %29] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %49 = vector.extract %6#17[1] : vector<2x2xf16> + %49 = vector.extract %6#17[1] : vector<2xf16> from vector<2x2xf16> vector.store %49, %alloc[%37, %29] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> 
- %50 = vector.extract %6#16[0] : vector<2x2xf16> + %50 = vector.extract %6#16[0] : vector<2xf16> from vector<2x2xf16> vector.store %50, %alloc[%35, %32] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %51 = vector.extract %6#16[1] : vector<2x2xf16> + %51 = vector.extract %6#16[1] : vector<2xf16> from vector<2x2xf16> vector.store %51, %alloc[%37, %32] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %52 = vector.extract %6#15[0] : vector<2x2xf16> + %52 = vector.extract %6#15[0] : vector<2xf16> from vector<2x2xf16> %53 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 16)>()[%1, %7] vector.store %52, %alloc[%53, %10] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %54 = vector.extract %6#15[1] : vector<2x2xf16> + %54 = vector.extract %6#15[1] : vector<2xf16> from vector<2x2xf16> %55 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 24)>()[%1, %7] vector.store %54, %alloc[%55, %10] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %56 = vector.extract %6#14[0] : vector<2x2xf16> + %56 = vector.extract %6#14[0] : vector<2xf16> from vector<2x2xf16> vector.store %56, %alloc[%53, %14] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %57 = vector.extract %6#14[1] : vector<2x2xf16> + %57 = vector.extract %6#14[1] : vector<2xf16> from vector<2x2xf16> vector.store %57, %alloc[%55, %14] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %58 = vector.extract %6#13[0] : vector<2x2xf16> + %58 = vector.extract %6#13[0] : vector<2xf16> from vector<2x2xf16> vector.store %58, %alloc[%53, %17] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %59 = vector.extract %6#13[1] : vector<2x2xf16> + %59 = vector.extract %6#13[1] : vector<2xf16> from vector<2x2xf16> vector.store %59, %alloc[%55, %17] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %60 = vector.extract %6#12[0] : vector<2x2xf16> + %60 = vector.extract %6#12[0] : vector<2xf16> from vector<2x2xf16> vector.store %60, %alloc[%53, %20] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %61 = vector.extract %6#12[1] : vector<2x2xf16> + %61 = vector.extract %6#12[1] : vector<2xf16> from vector<2x2xf16> vector.store %61, %alloc[%55, %20] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %62 = vector.extract %6#11[0] : vector<2x2xf16> + %62 = vector.extract %6#11[0] : vector<2xf16> from vector<2x2xf16> vector.store %62, %alloc[%53, %23] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %63 = vector.extract %6#11[1] : vector<2x2xf16> + %63 = vector.extract %6#11[1] : vector<2xf16> from vector<2x2xf16> vector.store %63, %alloc[%55, %23] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %64 = vector.extract %6#10[0] : vector<2x2xf16> + %64 = vector.extract %6#10[0] : vector<2xf16> from vector<2x2xf16> vector.store %64, %alloc[%53, %26] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %65 = vector.extract %6#10[1] : vector<2x2xf16> + %65 = vector.extract %6#10[1] : vector<2xf16> from vector<2x2xf16> vector.store %65, %alloc[%55, %26] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %66 = vector.extract %6#9[0] : vector<2x2xf16> + %66 = vector.extract %6#9[0] : vector<2xf16> from vector<2x2xf16> vector.store %66, %alloc[%53, %29] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %67 = vector.extract %6#9[1] : vector<2x2xf16> + %67 = vector.extract %6#9[1] : vector<2xf16> from vector<2x2xf16> vector.store %67, %alloc[%55, %29] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %68 = 
vector.extract %6#8[0] : vector<2x2xf16> + %68 = vector.extract %6#8[0] : vector<2xf16> from vector<2x2xf16> vector.store %68, %alloc[%53, %32] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %69 = vector.extract %6#8[1] : vector<2x2xf16> + %69 = vector.extract %6#8[1] : vector<2xf16> from vector<2x2xf16> vector.store %69, %alloc[%55, %32] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %70 = vector.extract %6#7[0] : vector<2x2xf16> + %70 = vector.extract %6#7[0] : vector<2xf16> from vector<2x2xf16> %71 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4)>()[%1, %7] vector.store %70, %alloc[%71, %10] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %72 = vector.extract %6#7[1] : vector<2x2xf16> + %72 = vector.extract %6#7[1] : vector<2xf16> from vector<2x2xf16> %73 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 8)>()[%1, %7] vector.store %72, %alloc[%73, %10] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %74 = vector.extract %6#6[0] : vector<2x2xf16> + %74 = vector.extract %6#6[0] : vector<2xf16> from vector<2x2xf16> vector.store %74, %alloc[%71, %14] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %75 = vector.extract %6#6[1] : vector<2x2xf16> + %75 = vector.extract %6#6[1] : vector<2xf16> from vector<2x2xf16> vector.store %75, %alloc[%73, %14] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %76 = vector.extract %6#5[0] : vector<2x2xf16> + %76 = vector.extract %6#5[0] : vector<2xf16> from vector<2x2xf16> vector.store %76, %alloc[%71, %17] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %77 = vector.extract %6#5[1] : vector<2x2xf16> + %77 = vector.extract %6#5[1] : vector<2xf16> from vector<2x2xf16> vector.store %77, %alloc[%73, %17] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %78 = vector.extract %6#4[0] : vector<2x2xf16> + %78 = vector.extract %6#4[0] : vector<2xf16> from vector<2x2xf16> vector.store %78, %alloc[%71, %20] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %79 = vector.extract %6#4[1] : vector<2x2xf16> + %79 = vector.extract %6#4[1] : vector<2xf16> from vector<2x2xf16> vector.store %79, %alloc[%73, %20] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %80 = vector.extract %6#3[0] : vector<2x2xf16> + %80 = vector.extract %6#3[0] : vector<2xf16> from vector<2x2xf16> vector.store %80, %alloc[%71, %23] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %81 = vector.extract %6#3[1] : vector<2x2xf16> + %81 = vector.extract %6#3[1] : vector<2xf16> from vector<2x2xf16> vector.store %81, %alloc[%73, %23] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %82 = vector.extract %6#2[0] : vector<2x2xf16> + %82 = vector.extract %6#2[0] : vector<2xf16> from vector<2x2xf16> vector.store %82, %alloc[%71, %26] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %83 = vector.extract %6#2[1] : vector<2x2xf16> + %83 = vector.extract %6#2[1] : vector<2xf16> from vector<2x2xf16> vector.store %83, %alloc[%73, %26] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %84 = vector.extract %6#1[0] : vector<2x2xf16> + %84 = vector.extract %6#1[0] : vector<2xf16> from vector<2x2xf16> vector.store %84, %alloc[%71, %29] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %85 = vector.extract %6#1[1] : vector<2x2xf16> + %85 = vector.extract %6#1[1] : vector<2xf16> from vector<2x2xf16> vector.store %85, %alloc[%73, %29] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %86 = vector.extract %6#0[0] : vector<2x2xf16> + 
%86 = vector.extract %6#0[0] : vector<2xf16> from vector<2x2xf16> vector.store %86, %alloc[%71, %32] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> - %87 = vector.extract %6#0[1] : vector<2x2xf16> + %87 = vector.extract %6#0[1] : vector<2xf16> from vector<2x2xf16> vector.store %87, %alloc[%73, %32] : memref<128x256xf16, #gpu.address_space>, vector<2xf16> gpu.barrier %88 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 4 + s2 * 8 + s0 floordiv 32)>()[%0, %1, %2] @@ -1037,149 +1037,149 @@ func.func @nvidia_tenscore_schedule_f32() { scf.yield %657, %658, %659, %660, %661, %662, %663, %664, %665, %666, %667, %668, %669, %670, %671, %672, %673, %674, %675, %676, %677, %678, %679, %680, %681, %682, %683, %684, %685, %686, %687, %688 : vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32> } %211 = gpu.lane_id - %212 = vector.extract %210#31[0] : vector<2x2xf32> + %212 = vector.extract %210#31[0] : vector<2xf32> from vector<2x2xf32> %213 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 48)>()[%1, %211] %214 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 56)>()[%0, %211] vector.store %212, %alloc[%213, %214] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %215 = vector.extract %210#31[1] : vector<2x2xf32> + %215 = vector.extract %210#31[1] : vector<2xf32> from vector<2x2xf32> %216 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 56)>()[%1, %211] vector.store %215, %alloc[%216, %214] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %217 = vector.extract %210#30[0] : vector<2x2xf32> + %217 = vector.extract %210#30[0] : vector<2xf32> from vector<2x2xf32> %218 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 48)>()[%0, %211] vector.store %217, %alloc[%213, %218] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %219 = vector.extract %210#30[1] : vector<2x2xf32> + %219 = vector.extract %210#30[1] : vector<2xf32> from vector<2x2xf32> vector.store %219, %alloc[%216, %218] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %220 = vector.extract %210#29[0] : vector<2x2xf32> + %220 = vector.extract %210#29[0] : vector<2xf32> from vector<2x2xf32> %221 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 40)>()[%0, %211] vector.store %220, %alloc[%213, %221] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %222 = vector.extract %210#29[1] : vector<2x2xf32> + %222 = vector.extract %210#29[1] : vector<2xf32> from vector<2x2xf32> vector.store %222, %alloc[%216, %221] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %223 = vector.extract %210#28[0] : vector<2x2xf32> + %223 = vector.extract %210#28[0] : vector<2xf32> from vector<2x2xf32> %224 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 32)>()[%0, %211] vector.store %223, %alloc[%213, %224] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %225 = vector.extract %210#28[1] : 
vector<2x2xf32> + %225 = vector.extract %210#28[1] : vector<2xf32> from vector<2x2xf32> vector.store %225, %alloc[%216, %224] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %226 = vector.extract %210#27[0] : vector<2x2xf32> + %226 = vector.extract %210#27[0] : vector<2xf32> from vector<2x2xf32> %227 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 24)>()[%0, %211] vector.store %226, %alloc[%213, %227] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %228 = vector.extract %210#27[1] : vector<2x2xf32> + %228 = vector.extract %210#27[1] : vector<2xf32> from vector<2x2xf32> vector.store %228, %alloc[%216, %227] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %229 = vector.extract %210#26[0] : vector<2x2xf32> + %229 = vector.extract %210#26[0] : vector<2xf32> from vector<2x2xf32> %230 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 16)>()[%0, %211] vector.store %229, %alloc[%213, %230] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %231 = vector.extract %210#26[1] : vector<2x2xf32> + %231 = vector.extract %210#26[1] : vector<2xf32> from vector<2x2xf32> vector.store %231, %alloc[%216, %230] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %232 = vector.extract %210#25[0] : vector<2x2xf32> + %232 = vector.extract %210#25[0] : vector<2xf32> from vector<2x2xf32> %233 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64 + 8)>()[%0, %211] vector.store %232, %alloc[%213, %233] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %234 = vector.extract %210#25[1] : vector<2x2xf32> + %234 = vector.extract %210#25[1] : vector<2xf32> from vector<2x2xf32> vector.store %234, %alloc[%216, %233] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %235 = vector.extract %210#24[0] : vector<2x2xf32> + %235 = vector.extract %210#24[0] : vector<2xf32> from vector<2x2xf32> %236 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 - (s1 floordiv 4) * 8 + (s0 floordiv 32) * 64)>()[%0, %211] vector.store %235, %alloc[%213, %236] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %237 = vector.extract %210#24[1] : vector<2x2xf32> + %237 = vector.extract %210#24[1] : vector<2xf32> from vector<2x2xf32> vector.store %237, %alloc[%216, %236] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %238 = vector.extract %210#23[0] : vector<2x2xf32> + %238 = vector.extract %210#23[0] : vector<2xf32> from vector<2x2xf32> %239 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 32)>()[%1, %211] vector.store %238, %alloc[%239, %214] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %240 = vector.extract %210#23[1] : vector<2x2xf32> + %240 = vector.extract %210#23[1] : vector<2xf32> from vector<2x2xf32> %241 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 40)>()[%1, %211] vector.store %240, %alloc[%241, %214] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %242 = vector.extract %210#22[0] : vector<2x2xf32> + %242 = vector.extract %210#22[0] : vector<2xf32> from vector<2x2xf32> vector.store %242, %alloc[%239, %218] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %243 = vector.extract %210#22[1] : vector<2x2xf32> + %243 = vector.extract %210#22[1] : vector<2xf32> from vector<2x2xf32> vector.store %243, %alloc[%241, %218] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %244 = vector.extract %210#21[0] : vector<2x2xf32> + %244 = vector.extract %210#21[0] 
: vector<2xf32> from vector<2x2xf32> vector.store %244, %alloc[%239, %221] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %245 = vector.extract %210#21[1] : vector<2x2xf32> + %245 = vector.extract %210#21[1] : vector<2xf32> from vector<2x2xf32> vector.store %245, %alloc[%241, %221] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %246 = vector.extract %210#20[0] : vector<2x2xf32> + %246 = vector.extract %210#20[0] : vector<2xf32> from vector<2x2xf32> vector.store %246, %alloc[%239, %224] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %247 = vector.extract %210#20[1] : vector<2x2xf32> + %247 = vector.extract %210#20[1] : vector<2xf32> from vector<2x2xf32> vector.store %247, %alloc[%241, %224] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %248 = vector.extract %210#19[0] : vector<2x2xf32> + %248 = vector.extract %210#19[0] : vector<2xf32> from vector<2x2xf32> vector.store %248, %alloc[%239, %227] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %249 = vector.extract %210#19[1] : vector<2x2xf32> + %249 = vector.extract %210#19[1] : vector<2xf32> from vector<2x2xf32> vector.store %249, %alloc[%241, %227] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %250 = vector.extract %210#18[0] : vector<2x2xf32> + %250 = vector.extract %210#18[0] : vector<2xf32> from vector<2x2xf32> vector.store %250, %alloc[%239, %230] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %251 = vector.extract %210#18[1] : vector<2x2xf32> + %251 = vector.extract %210#18[1] : vector<2xf32> from vector<2x2xf32> vector.store %251, %alloc[%241, %230] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %252 = vector.extract %210#17[0] : vector<2x2xf32> + %252 = vector.extract %210#17[0] : vector<2xf32> from vector<2x2xf32> vector.store %252, %alloc[%239, %233] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %253 = vector.extract %210#17[1] : vector<2x2xf32> + %253 = vector.extract %210#17[1] : vector<2xf32> from vector<2x2xf32> vector.store %253, %alloc[%241, %233] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %254 = vector.extract %210#16[0] : vector<2x2xf32> + %254 = vector.extract %210#16[0] : vector<2xf32> from vector<2x2xf32> vector.store %254, %alloc[%239, %236] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %255 = vector.extract %210#16[1] : vector<2x2xf32> + %255 = vector.extract %210#16[1] : vector<2xf32> from vector<2x2xf32> vector.store %255, %alloc[%241, %236] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %256 = vector.extract %210#15[0] : vector<2x2xf32> + %256 = vector.extract %210#15[0] : vector<2xf32> from vector<2x2xf32> %257 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 16)>()[%1, %211] vector.store %256, %alloc[%257, %214] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %258 = vector.extract %210#15[1] : vector<2x2xf32> + %258 = vector.extract %210#15[1] : vector<2xf32> from vector<2x2xf32> %259 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 24)>()[%1, %211] vector.store %258, %alloc[%259, %214] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %260 = vector.extract %210#14[0] : vector<2x2xf32> + %260 = vector.extract %210#14[0] : vector<2xf32> from vector<2x2xf32> vector.store %260, %alloc[%257, %218] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %261 = vector.extract %210#14[1] : vector<2x2xf32> + %261 = vector.extract %210#14[1] : vector<2xf32> from vector<2x2xf32> vector.store %261, %alloc[%259, %218] : 
memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %262 = vector.extract %210#13[0] : vector<2x2xf32> + %262 = vector.extract %210#13[0] : vector<2xf32> from vector<2x2xf32> vector.store %262, %alloc[%257, %221] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %263 = vector.extract %210#13[1] : vector<2x2xf32> + %263 = vector.extract %210#13[1] : vector<2xf32> from vector<2x2xf32> vector.store %263, %alloc[%259, %221] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %264 = vector.extract %210#12[0] : vector<2x2xf32> + %264 = vector.extract %210#12[0] : vector<2xf32> from vector<2x2xf32> vector.store %264, %alloc[%257, %224] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %265 = vector.extract %210#12[1] : vector<2x2xf32> + %265 = vector.extract %210#12[1] : vector<2xf32> from vector<2x2xf32> vector.store %265, %alloc[%259, %224] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %266 = vector.extract %210#11[0] : vector<2x2xf32> + %266 = vector.extract %210#11[0] : vector<2xf32> from vector<2x2xf32> vector.store %266, %alloc[%257, %227] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %267 = vector.extract %210#11[1] : vector<2x2xf32> + %267 = vector.extract %210#11[1] : vector<2xf32> from vector<2x2xf32> vector.store %267, %alloc[%259, %227] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %268 = vector.extract %210#10[0] : vector<2x2xf32> + %268 = vector.extract %210#10[0] : vector<2xf32> from vector<2x2xf32> vector.store %268, %alloc[%257, %230] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %269 = vector.extract %210#10[1] : vector<2x2xf32> + %269 = vector.extract %210#10[1] : vector<2xf32> from vector<2x2xf32> vector.store %269, %alloc[%259, %230] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %270 = vector.extract %210#9[0] : vector<2x2xf32> + %270 = vector.extract %210#9[0] : vector<2xf32> from vector<2x2xf32> vector.store %270, %alloc[%257, %233] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %271 = vector.extract %210#9[1] : vector<2x2xf32> + %271 = vector.extract %210#9[1] : vector<2xf32> from vector<2x2xf32> vector.store %271, %alloc[%259, %233] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %272 = vector.extract %210#8[0] : vector<2x2xf32> + %272 = vector.extract %210#8[0] : vector<2xf32> from vector<2x2xf32> vector.store %272, %alloc[%257, %236] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %273 = vector.extract %210#8[1] : vector<2x2xf32> + %273 = vector.extract %210#8[1] : vector<2xf32> from vector<2x2xf32> vector.store %273, %alloc[%259, %236] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %274 = vector.extract %210#7[0] : vector<2x2xf32> + %274 = vector.extract %210#7[0] : vector<2xf32> from vector<2x2xf32> %275 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4)>()[%1, %211] vector.store %274, %alloc[%275, %214] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %276 = vector.extract %210#7[1] : vector<2x2xf32> + %276 = vector.extract %210#7[1] : vector<2xf32> from vector<2x2xf32> %277 = affine.apply affine_map<()[s0, s1] -> (s0 * 64 + s1 floordiv 4 + 8)>()[%1, %211] vector.store %276, %alloc[%277, %214] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %278 = vector.extract %210#6[0] : vector<2x2xf32> + %278 = vector.extract %210#6[0] : vector<2xf32> from vector<2x2xf32> vector.store %278, %alloc[%275, %218] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %279 = vector.extract %210#6[1] : 
vector<2x2xf32> + %279 = vector.extract %210#6[1] : vector<2xf32> from vector<2x2xf32> vector.store %279, %alloc[%277, %218] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %280 = vector.extract %210#5[0] : vector<2x2xf32> + %280 = vector.extract %210#5[0] : vector<2xf32> from vector<2x2xf32> vector.store %280, %alloc[%275, %221] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %281 = vector.extract %210#5[1] : vector<2x2xf32> + %281 = vector.extract %210#5[1] : vector<2xf32> from vector<2x2xf32> vector.store %281, %alloc[%277, %221] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %282 = vector.extract %210#4[0] : vector<2x2xf32> + %282 = vector.extract %210#4[0] : vector<2xf32> from vector<2x2xf32> vector.store %282, %alloc[%275, %224] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %283 = vector.extract %210#4[1] : vector<2x2xf32> + %283 = vector.extract %210#4[1] : vector<2xf32> from vector<2x2xf32> vector.store %283, %alloc[%277, %224] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %284 = vector.extract %210#3[0] : vector<2x2xf32> + %284 = vector.extract %210#3[0] : vector<2xf32> from vector<2x2xf32> vector.store %284, %alloc[%275, %227] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %285 = vector.extract %210#3[1] : vector<2x2xf32> + %285 = vector.extract %210#3[1] : vector<2xf32> from vector<2x2xf32> vector.store %285, %alloc[%277, %227] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %286 = vector.extract %210#2[0] : vector<2x2xf32> + %286 = vector.extract %210#2[0] : vector<2xf32> from vector<2x2xf32> vector.store %286, %alloc[%275, %230] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %287 = vector.extract %210#2[1] : vector<2x2xf32> + %287 = vector.extract %210#2[1] : vector<2xf32> from vector<2x2xf32> vector.store %287, %alloc[%277, %230] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %288 = vector.extract %210#1[0] : vector<2x2xf32> + %288 = vector.extract %210#1[0] : vector<2xf32> from vector<2x2xf32> vector.store %288, %alloc[%275, %233] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %289 = vector.extract %210#1[1] : vector<2x2xf32> + %289 = vector.extract %210#1[1] : vector<2xf32> from vector<2x2xf32> vector.store %289, %alloc[%277, %233] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %290 = vector.extract %210#0[0] : vector<2x2xf32> + %290 = vector.extract %210#0[0] : vector<2xf32> from vector<2x2xf32> vector.store %290, %alloc[%275, %236] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> - %291 = vector.extract %210#0[1] : vector<2x2xf32> + %291 = vector.extract %210#0[1] : vector<2xf32> from vector<2x2xf32> vector.store %291, %alloc[%277, %236] : memref<128x128xf32, #gpu.address_space>, vector<2xf32> gpu.barrier %292 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 2 + s2 * 4 + s0 floordiv 32)>()[%0, %1, %2] diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td index a937c80db9b2..4f96206cff87 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td @@ -387,7 +387,7 @@ def ForallToWorkgroupOp : Op f32 { %c10 = arith.constant 10 : index %0 = vector.broadcast %arg0 : f32 to vector<4xf32> %20 = scf.for %arg3 = %c0 to %c10 step %c1 iter_args(%arg4 = %0) -> (vector<4xf32>) { - %a = 
vector.extract %arg4[0] : vector<4xf32> + %a = vector.extract %arg4[0] : f32 from vector<4xf32> %c = arith.addf %a, %a : f32 %bc = vector.broadcast %c : f32 to vector<4xf32> scf.yield %bc : vector<4xf32> } - %21 = vector.extract %20[0] : vector<4xf32> + %21 = vector.extract %20[0] : f32 from vector<4xf32> return %21 : f32 } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir index e004a265b353..93e5e3925a64 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir @@ -15,7 +15,7 @@ transform.sequence failures(propagate) { // Step 1. Map to a single block by tiling with size 1 and fusing. %fusion_root_1, %fusion_group_1 = transform.iree.take_first %maybe_trailing_0, %combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - %grid_loop, %outer_tiled = transform.structured.tile_to_forall_op %fusion_root_1 tile_sizes [1] + %outer_tiled, %grid_loop = transform.structured.tile_using_forall %fusion_root_1 tile_sizes [1] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -45,8 +45,8 @@ transform.sequence failures(propagate) { // =========================================================================== %fusion_group_22_full = transform.merge_handles %fused_2, %original_fill_2 : !transform.any_op - %block_loop_22, %fusion_root_22_tiled = - transform.structured.tile_to_forall_op %outer_tiled + %fusion_root_22_tiled, %block_loop_22 = + transform.structured.tile_using_forall %outer_tiled tile_sizes [1] ( mapping = [#gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %fusion_group_22_full into %block_loop_22 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -54,8 +54,8 @@ transform.sequence failures(propagate) { %fusion_group_21 = transform.merge_handles %maybe_leading_2, %more_parallel_fill_2 : !transform.any_op - %block_loop_21, %fusion_root_21_tiled = - transform.structured.tile_to_forall_op %parallel_reduction_2 + %fusion_root_21_tiled, %block_loop_21 = + transform.structured.tile_using_forall %parallel_reduction_2 tile_sizes [1, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %fusion_group_21 into %block_loop_21 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp index 4f3a163dc060..074cfb7a5f41 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp @@ -113,6 +113,8 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp, if (failed(tilingResult)) { return failure(); } + auto forLoops = llvm::to_vector(llvm::map_range( tilingResult->loops, [](Operation *op) { return cast<scf::ForOp>(op); })); yieldedValuesToOrigValues.append(rootOp->result_begin(), rootOp->result_end()); // A map from untiled value to scf.for iter_arg. 
The iter_arg is used for DPS @@ -129,9 +131,9 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp, tilingResult->tiledOps[0] = replacementTiledOp.value(); } } else if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(rootOp)) { - for (auto [init, iterArg] : - llvm::zip_equal(dpsOp.getDpsInits(), - tilingResult->loops.back().getRegionIterArgs())) { + for (auto [init, iterArg] : llvm::zip_equal( + dpsOp.getDpsInits(), + cast<scf::ForOp>(forLoops.back()).getRegionIterArgs())) { mapToIterArg[init] = iterArg; } } @@ -174,8 +176,7 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp, // Materialize the slice of the producer in place. std::optional<scf::SCFFuseProducerOfSliceResult> fusedProducer = - tileAndFuseProducerOfSlice(rewriter, candidateSliceOp, - tilingResult->loops); + tileAndFuseProducerOfSlice(rewriter, candidateSliceOp, forLoops); if (!fusedProducer) continue; @@ -183,11 +184,10 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp, // to be yielded from within the tiled loop. OpResult untiledProducer = fusedProducer->origProducer; if (llvm::any_of(untiledProducer.getUsers(), [&](Operation *user) { - return !isIgnoredUser(user, tilingResult->loops.front()); + return !isIgnoredUser(user, forLoops.front()); })) { yieldReplacementForFusedProducer(rewriter, candidateSliceOp, - fusedProducer.value(), - tilingResult->loops); + fusedProducer.value(), forLoops); yieldedValuesToOrigValues.push_back(untiledProducer); } @@ -198,7 +198,7 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp, } } - scf::ForOp outermostLoop = tilingResult->loops.front(); + scf::ForOp outermostLoop = forLoops.front(); for (auto [index, origVal] : llvm::enumerate(yieldedValuesToOrigValues)) { Value replacement = outermostLoop.getResult(index); rewriter.replaceUsesWithIf(origVal, replacement, [&](OpOperand &use) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir index 4340736934c8..52394096682b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_iree_tile_to_forall.mlir @@ -52,8 +52,8 @@ transform.sequence failures(propagate) { %original_matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall, %matmul = - transform.structured.tile_to_forall_op %original_matmul num_threads [32] + %matmul, %forall = + transform.structured.tile_using_forall %original_matmul num_threads [32] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -113,7 +113,7 @@ hal.executable private @matmul_static_dispatch_0 { transform.sequence failures(propagate) { ^bb1(%variant_op: !transform.any_op): %1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_op, %tiled_op = transform.structured.tile_to_forall_op %1 num_threads [] tile_sizes [1, 1, 1](mapping = [#gpu.block, #gpu.block, #gpu.block]): (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %tiled_op, %forall_op = transform.structured.tile_using_forall %1 num_threads [] tile_sizes [1, 1, 1](mapping = [#gpu.block, #gpu.block, #gpu.block]): (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> () } @@ -163,6 
+163,6 @@ hal.executable private @matmul_static_dispatch_0 { transform.sequence failures(propagate) { ^bb1(%variant_op: !transform.any_op): %1 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_op, %tiled_op = transform.structured.tile_to_forall_op %1 num_threads [] tile_sizes [5, 3](mapping = [#gpu.block, #gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %tiled_op, %forall_op = transform.structured.tile_using_forall %1 num_threads [] tile_sizes [5, 3](mapping = [#gpu.block, #gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_op : (!transform.any_op) -> () } diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_intrinsics.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_intrinsics.mlir index 36461fb7046c..e392166c3468 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_intrinsics.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_contract_to_arm_intrinsics.mlir @@ -5,14 +5,14 @@ // CHECK-SAME: %[[RHS:[^:[:space:]]+]] // CHECK-SAME: %[[ACC:[^:[:space:]]+]] // CHECK-DAG: %[[ZERO:.*]] = arith.constant dense<0> : vector<4x4xi8> -// CHECK-DAG: %[[ACC_ROW_0:.*]] = vector.extract %[[ACC]][0] : vector<8x8xi32> -// CHECK-DAG: %[[ACC_ROW_1:.*]] = vector.extract %[[ACC]][1] : vector<8x8xi32> -// CHECK-DAG: %[[ACC_ROW_2:.*]] = vector.extract %[[ACC]][2] : vector<8x8xi32> -// CHECK-DAG: %[[ACC_ROW_3:.*]] = vector.extract %[[ACC]][3] : vector<8x8xi32> -// CHECK-DAG: %[[ACC_ROW_4:.*]] = vector.extract %[[ACC]][4] : vector<8x8xi32> -// CHECK-DAG: %[[ACC_ROW_5:.*]] = vector.extract %[[ACC]][5] : vector<8x8xi32> -// CHECK-DAG: %[[ACC_ROW_6:.*]] = vector.extract %[[ACC]][6] : vector<8x8xi32> -// CHECK-DAG: %[[ACC_ROW_7:.*]] = vector.extract %[[ACC]][7] : vector<8x8xi32> +// CHECK-DAG: %[[ACC_ROW_0:.*]] = vector.extract %[[ACC]][0] : vector<8xi32> from vector<8x8xi32> +// CHECK-DAG: %[[ACC_ROW_1:.*]] = vector.extract %[[ACC]][1] : vector<8xi32> from vector<8x8xi32> +// CHECK-DAG: %[[ACC_ROW_2:.*]] = vector.extract %[[ACC]][2] : vector<8xi32> from vector<8x8xi32> +// CHECK-DAG: %[[ACC_ROW_3:.*]] = vector.extract %[[ACC]][3] : vector<8xi32> from vector<8x8xi32> +// CHECK-DAG: %[[ACC_ROW_4:.*]] = vector.extract %[[ACC]][4] : vector<8xi32> from vector<8x8xi32> +// CHECK-DAG: %[[ACC_ROW_5:.*]] = vector.extract %[[ACC]][5] : vector<8xi32> from vector<8x8xi32> +// CHECK-DAG: %[[ACC_ROW_6:.*]] = vector.extract %[[ACC]][6] : vector<8xi32> from vector<8x8xi32> +// CHECK-DAG: %[[ACC_ROW_7:.*]] = vector.extract %[[ACC]][7] : vector<8xi32> from vector<8x8xi32> // CHECK-DAG: %[[ACC_CHUNK_00:.*]] = vector.extract_strided_slice %[[ACC_ROW_0]] {offsets = [0] // CHECK-DAG: %[[ACC_CHUNK_01:.*]] = vector.extract_strided_slice %[[ACC_ROW_0]] {offsets = [4] // CHECK-DAG: %[[ACC_CHUNK_02:.*]] = vector.extract_strided_slice %[[ACC_ROW_1]] {offsets = [0] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir index 6d20f3b30842..03b0caa47f19 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir @@ -38,15 +38,15 @@ transform.sequence failures(propagate) { // Tile and distribute to workgroups // ========================================== - %forall_grid, 
%tiled_attention = - transform.structured.tile_to_forall_op %attention tile_sizes [1, 128] + %tiled_attention, %forall_grid = + transform.structured.tile_using_forall %attention tile_sizes [1, 128] ( mapping = [#gpu.block, #gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () // Tile batch dimensions of attention // ========================================== %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %batch_tiled_attn, %loop = transform.structured.tile %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %top_level_func { transform.apply_patterns.canonicalization @@ -76,7 +76,7 @@ transform.sequence failures(propagate) { // Tile and fuse attention ops // ========================================== - %forall, %tiled_matmul = transform.structured.tile_to_forall_op %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %tiled_matmul, %forall = transform.structured.tile_using_forall %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) %f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -101,7 +101,7 @@ transform.sequence failures(propagate) { // Distribute fills and last truncate // ========================================== %fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op - %fill_grid, %tiled_fill = transform.structured.tile_to_forall_op %fills tile_sizes[32] (mapping = [#gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[32] (mapping = [#gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) // Vectorize function // ========================================== diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir index 969991d88a89..689e1d24f158 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir @@ -109,19 +109,19 @@ builtin.module { // CHECK: %[[D54:.+]] = vector.broadcast %[[D53]] : f16 to vector<1xf16> // CHECK: %[[D55:.+]] = vector.insert_strided_slice %[[D54]], %[[D52]] {offsets = [0, 0, 1, 1], strides = [1]} : // CHECK-SAME: vector<1xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D56:.+]] = vector.extract %[[CST]][0, 0] : vector<1x1x2x2xf16> -// CHECK: %[[D57:.+]] = vector.extract %[[D41]][0, 0] : vector<1x1x4x2xf16> -// CHECK: %[[D58:.+]] = vector.extract %[[D55]][0, 0] : 
vector<1x1x2x2xf16> +// CHECK: %[[D56:.+]] = vector.extract %[[CST]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> +// CHECK: %[[D57:.+]] = vector.extract %[[D41]][0, 0] : vector<4x2xf16> from vector<1x1x4x2xf16> +// CHECK: %[[D58:.+]] = vector.extract %[[D55]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> // CHECK: %[[D59:.+]] = nvgpu.mma.sync(%[[D57]], %[[D58]], %[[D56]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, // CHECK-SAME: vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // CHECK: %[[D60:.+]] = vector.insert %[[D59]], %[[CST]] [0, 0] : vector<2x2xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D61:.+]] = vector.extract %[[D60]][0, 0, 0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D61:.+]] = vector.extract %[[D60]][0, 0, 0, 0] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D61]], %[[D2]][%[[D8]], %[[D9]]] : memref<16x8xf16> -// CHECK: %[[D62:.+]] = vector.extract %[[D60]][0, 0, 0, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D62:.+]] = vector.extract %[[D60]][0, 0, 0, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D62]], %[[D2]][%[[D8]], %[[D14]]] : memref<16x8xf16> -// CHECK: %[[D63:.+]] = vector.extract %[[D60]][0, 0, 1, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D63:.+]] = vector.extract %[[D60]][0, 0, 1, 0] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D63]], %[[D2]][%[[D29]], %[[D9]]] : memref<16x8xf16> -// CHECK: %[[D64:.+]] = vector.extract %[[D60]][0, 0, 1, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D64:.+]] = vector.extract %[[D60]][0, 0, 1, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D64]], %[[D2]][%[[D29]], %[[D14]]] : memref<16x8xf16> // CHECK: return // CHECK: } @@ -242,20 +242,20 @@ builtin.module { // CHECK: %[[D54:.+]] = vector.broadcast %[[D53]] : f16 to vector<1xf16> // CHECK: %[[D55:.+]] = vector.insert_strided_slice %[[D54]], %[[D52]] {offsets = [0, 0, 1, 1], strides = [1]} : // CHECK-SAME: vector<1xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D56:.+]] = vector.extract %[[CST]][0, 0] : vector<1x1x2x2xf16> -// CHECK: %[[D57:.+]] = vector.extract %[[D41]][0, 0] : vector<1x1x4x2xf16> -// CHECK: %[[D58:.+]] = vector.extract %[[D55]][0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D56:.+]] = vector.extract %[[CST]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> +// CHECK: %[[D57:.+]] = vector.extract %[[D41]][0, 0] : vector<4x2xf16> from vector<1x1x4x2xf16> +// CHECK: %[[D58:.+]] = vector.extract %[[D55]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> // CHECK: %[[D59:.+]] = nvgpu.mma.sync(%[[D57]], %[[D58]], %[[D56]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, // CHECK-SAME: vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // CHECK: %[[D60:.+]] = vector.insert %[[D59]], %[[CST]] [0, 0] : vector<2x2xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D61:.+]] = vector.extract %[[CST_0]][0, 0, 0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D61:.+]] = vector.extract %[[CST_0]][0, 0, 0, 0] : f16 from vector<1x1x2x2xf16> // CHECK-DAG: %[[CST_2:.+]] = arith.constant dense<0.000000e+00> : vector<2xf16> -// CHECK: %[[D62:.+]] = vector.extract %[[D60]][0, 0, 0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D62:.+]] = vector.extract %[[D60]][0, 0, 0, 0] : f16 from vector<1x1x2x2xf16> // CHECK: %[[D63:.+]] = vector.insert %[[D62]], %[[CST_2]] [0] : f16 into vector<2xf16> -// CHECK: %[[D64:.+]] = vector.extract %[[D60]][0, 0, 0, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D64:.+]] = vector.extract %[[D60]][0, 0, 0, 1] : f16 from vector<1x1x2x2xf16> // CHECK: %[[D65:.+]] = vector.insert %[[D64]], %[[D63]] [1] : f16 into vector<2xf16> // CHECK: %[[D66:.+]] = 
vector.bitcast %[[D65]] : vector<2xf16> to vector<1xi32> -// CHECK: %[[D67:.+]] = vector.extract %[[D66]][0] : vector<1xi32> +// CHECK: %[[D67:.+]] = vector.extract %[[D66]][0] : i32 from vector<1xi32> // CHECK-DAG: %[[C1_I32:.+]] = arith.constant 1 : i32 // CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32 // CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D67]], %[[C1_I32]], %[[C32_I32]] : i32 @@ -263,48 +263,48 @@ builtin.module { // CHECK: %[[D69:.+]] = vector.bitcast %[[D68]] : vector<1xi32> to vector<2xf16> // CHECK: %[[D70:.+]] = arith.maximumf %[[D69]], %[[D65]] : vector<2xf16> // CHECK: %[[D71:.+]] = vector.bitcast %[[D70]] : vector<2xf16> to vector<1xi32> -// CHECK: %[[D72:.+]] = vector.extract %[[D71]][0] : vector<1xi32> +// CHECK: %[[D72:.+]] = vector.extract %[[D71]][0] : i32 from vector<1xi32> // CHECK-DAG: %[[C2_I32:.+]] = arith.constant 2 : i32 // CHECK: %[[SHUFFLERESULT_3:.+]], %[[VALID_4:.+]] = gpu.shuffle xor %[[D72]], %[[C2_I32]], %[[C32_I32]] : i32 // CHECK: %[[D73:.+]] = vector.broadcast %[[SHUFFLERESULT_3]] : i32 to vector<1xi32> // CHECK: %[[D74:.+]] = vector.bitcast %[[D73]] : vector<1xi32> to vector<2xf16> // CHECK: %[[D75:.+]] = arith.maximumf %[[D74]], %[[D70]] : vector<2xf16> -// CHECK: %[[D76:.+]] = vector.extract %[[D75]][0] : vector<2xf16> -// CHECK: %[[D77:.+]] = vector.extract %[[D75]][1] : vector<2xf16> +// CHECK: %[[D76:.+]] = vector.extract %[[D75]][0] : f16 from vector<2xf16> +// CHECK: %[[D77:.+]] = vector.extract %[[D75]][1] : f16 from vector<2xf16> // CHECK: %[[D78:.+]] = arith.maximumf %[[D76]], %[[D77]] : f16 // CHECK: %[[D79:.+]] = arith.maximumf %[[D78]], %[[D61]] : f16 // CHECK: %[[D80:.+]] = vector.insert %[[D79]], %[[CST]] [0, 0, 0, 0] : f16 into vector<1x1x2x2xf16> // CHECK: %[[D81:.+]] = vector.insert %[[D79]], %[[D80]] [0, 0, 0, 1] : f16 into vector<1x1x2x2xf16> -// CHECK: %[[D82:.+]] = vector.extract %[[CST_0]][0, 0, 1, 0] : vector<1x1x2x2xf16> -// CHECK: %[[D83:.+]] = vector.extract %[[D60]][0, 0, 1, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D82:.+]] = vector.extract %[[CST_0]][0, 0, 1, 0] : f16 from vector<1x1x2x2xf16> +// CHECK: %[[D83:.+]] = vector.extract %[[D60]][0, 0, 1, 0] : f16 from vector<1x1x2x2xf16> // CHECK: %[[D84:.+]] = vector.insert %[[D83]], %[[CST_2]] [0] : f16 into vector<2xf16> -// CHECK: %[[D85:.+]] = vector.extract %[[D60]][0, 0, 1, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D85:.+]] = vector.extract %[[D60]][0, 0, 1, 1] : f16 from vector<1x1x2x2xf16> // CHECK: %[[D86:.+]] = vector.insert %[[D85]], %[[D84]] [1] : f16 into vector<2xf16> // CHECK: %[[D87:.+]] = vector.bitcast %[[D86]] : vector<2xf16> to vector<1xi32> -// CHECK: %[[D88:.+]] = vector.extract %[[D87]][0] : vector<1xi32> +// CHECK: %[[D88:.+]] = vector.extract %[[D87]][0] : i32 from vector<1xi32> // CHECK: %[[SHUFFLERESULT_5:.+]], %[[VALID_6:.+]] = gpu.shuffle xor %[[D88]], %[[C1_I32]], %[[C32_I32]] : i32 // CHECK: %[[D89:.+]] = vector.broadcast %[[SHUFFLERESULT_5]] : i32 to vector<1xi32> // CHECK: %[[D90:.+]] = vector.bitcast %[[D89]] : vector<1xi32> to vector<2xf16> // CHECK: %[[D91:.+]] = arith.maximumf %[[D90]], %[[D86]] : vector<2xf16> // CHECK: %[[D92:.+]] = vector.bitcast %[[D91]] : vector<2xf16> to vector<1xi32> -// CHECK: %[[D93:.+]] = vector.extract %[[D92]][0] : vector<1xi32> +// CHECK: %[[D93:.+]] = vector.extract %[[D92]][0] : i32 from vector<1xi32> // CHECK: %[[SHUFFLERESULT_7:.+]], %[[VALID_8:.+]] = gpu.shuffle xor %[[D93]], %[[C2_I32]], %[[C32_I32]] : i32 // CHECK: %[[D94:.+]] = vector.broadcast %[[SHUFFLERESULT_7]] : i32 to 
vector<1xi32> // CHECK: %[[D95:.+]] = vector.bitcast %[[D94]] : vector<1xi32> to vector<2xf16> // CHECK: %[[D96:.+]] = arith.maximumf %[[D95]], %[[D91]] : vector<2xf16> -// CHECK: %[[D97:.+]] = vector.extract %[[D96]][0] : vector<2xf16> -// CHECK: %[[D98:.+]] = vector.extract %[[D96]][1] : vector<2xf16> +// CHECK: %[[D97:.+]] = vector.extract %[[D96]][0] : f16 from vector<2xf16> +// CHECK: %[[D98:.+]] = vector.extract %[[D96]][1] : f16 from vector<2xf16> // CHECK: %[[D99:.+]] = arith.maximumf %[[D97]], %[[D98]] : f16 // CHECK: %[[D100:.+]] = arith.maximumf %[[D99]], %[[D82]] : f16 // CHECK: %[[D101:.+]] = vector.insert %[[D100]], %[[D81]] [0, 0, 1, 0] : f16 into vector<1x1x2x2xf16> // CHECK: %[[D102:.+]] = vector.insert %[[D100]], %[[D101]] [0, 0, 1, 1] : f16 into vector<1x1x2x2xf16> -// CHECK: %[[D103:.+]] = vector.extract %[[D102]][0, 0, 0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D103:.+]] = vector.extract %[[D102]][0, 0, 0, 0] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D103]], %[[D2]][%[[D8]], %[[D9]]] : memref<16x8xf16> -// CHECK: %[[D104:.+]] = vector.extract %[[D102]][0, 0, 0, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D104:.+]] = vector.extract %[[D102]][0, 0, 0, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D104]], %[[D2]][%[[D8]], %[[D14]]] : memref<16x8xf16> -// CHECK: %[[D105:.+]] = vector.extract %[[D102]][0, 0, 1, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D105:.+]] = vector.extract %[[D102]][0, 0, 1, 0] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D105]], %[[D2]][%[[D29]], %[[D9]]] : memref<16x8xf16> -// CHECK: %[[D106:.+]] = vector.extract %[[D102]][0, 0, 1, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D106:.+]] = vector.extract %[[D102]][0, 0, 1, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D106]], %[[D2]][%[[D29]], %[[D14]]] : memref<16x8xf16> // CHECK: return // CHECK: } @@ -477,21 +477,21 @@ builtin.module { // CHECK: %[[D80:.+]] = vector.broadcast %[[D79]] : f16 to vector<1xf16> // CHECK: %[[D81:.+]] = vector.insert_strided_slice %[[D80]], %[[D78]] {offsets = [0, 0, 1, 1], strides = [1]} // CHECK-SAME: : vector<1xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D82:.+]] = vector.extract %[[ARG2]][0, 0] : vector<1x1x2x2xf16> -// CHECK: %[[D83:.+]] = vector.extract %[[D67]][0, 0] : vector<1x1x4x2xf16> -// CHECK: %[[D84:.+]] = vector.extract %[[D81]][0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D82:.+]] = vector.extract %[[ARG2]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> +// CHECK: %[[D83:.+]] = vector.extract %[[D67]][0, 0] : vector<4x2xf16> from vector<1x1x4x2xf16> +// CHECK: %[[D84:.+]] = vector.extract %[[D81]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> // CHECK: %[[D85:.+]] = nvgpu.mma.sync(%[[D83]], %[[D84]], %[[D82]]) {mmaShape = [16, 8, 16]} : // CHECK-SAME: (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // CHECK: %[[D86:.+]] = vector.insert %[[D85]], %[[CST]] [0, 0] : vector<2x2xf16> into vector<1x1x2x2xf16> // CHECK: scf.yield %[[CST_0]], %[[D86]] : vector<16x8xf16>, vector<1x1x2x2xf16> // CHECK: } -// CHECK: %[[D30:.+]] = vector.extract %[[D29]]#[[D1:.+]][0, 0, 0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D30:.+]] = vector.extract %[[D29]]#[[D1:.+]][0, 0, 0, 0] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D30]], %[[D3]][%[[D11]], %[[D12]]] : memref<16x8xf16> -// CHECK: %[[D31:.+]] = vector.extract %[[D29]]#[[D1]][0, 0, 0, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D31:.+]] = vector.extract %[[D29]]#[[D1]][0, 0, 0, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D31]], 
%[[D3]][%[[D11]], %[[D17]]] : memref<16x8xf16> -// CHECK: %[[D32:.+]] = vector.extract %[[D29]]#[[D1]][0, 0, 1, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D32:.+]] = vector.extract %[[D29]]#[[D1]][0, 0, 1, 0] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D32]], %[[D3]][%[[D22]], %[[D12]]] : memref<16x8xf16> -// CHECK: %[[D33:.+]] = vector.extract %[[D29]]#[[D1]][0, 0, 1, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D33:.+]] = vector.extract %[[D29]]#[[D1]][0, 0, 1, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D33]], %[[D3]][%[[D22]], %[[D17]]] : memref<16x8xf16> // CHECK: return // CHECK: } @@ -643,9 +643,9 @@ builtin.module { // CHECK: %[[D75:.+]] = vector.broadcast %[[D74]] : f16 to vector<1xf16> // CHECK: %[[D76:.+]] = vector.insert_strided_slice %[[D75]], %[[D73]] {offsets = [0, 0, 1, 1], strides = [1]} // CHECK-SAME: : vector<1xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D77:.+]] = vector.extract %[[ARG2]][0, 0] : vector<1x1x2x2xf16> -// CHECK: %[[D78:.+]] = vector.extract %[[D62]][0, 0] : vector<1x1x4x2xf16> -// CHECK: %[[D79:.+]] = vector.extract %[[D76]][0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D77:.+]] = vector.extract %[[ARG2]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> +// CHECK: %[[D78:.+]] = vector.extract %[[D62]][0, 0] : vector<4x2xf16> from vector<1x1x4x2xf16> +// CHECK: %[[D79:.+]] = vector.extract %[[D76]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> // CHECK: %[[D80:.+]] = nvgpu.mma.sync(%[[D78]], %[[D79]], %[[D77]]) {mmaShape = [16, 8, 16]} : // CHECK-SAME: (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // CHECK: %[[D81:.+]] = vector.insert %[[D80]], %[[CST]] [0, 0] : vector<2x2xf16> into vector<1x1x2x2xf16> @@ -655,21 +655,21 @@ builtin.module { // CHECK-DAG: %[[D7:.+]] = gpu.thread_id x // CHECK-DAG: %[[D8:.+]] = gpu.thread_id y // CHECK-DAG: %[[D9:.+]] = gpu.thread_id z -// CHECK: %[[D10:.+]] = vector.extract %[[D5]]#[[D1:.+]][0, 0, 0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D10:.+]] = vector.extract %[[D5]]#[[D1:.+]][0, 0, 0, 0] : f16 from vector<1x1x2x2xf16> // CHECK-DAG: %[[D11:.+]] = affine.apply #[[MAP3]](%[[D7]], %[[D8]], %[[D9]]) // CHECK-DAG: %[[D12:.+]] = affine.apply #[[MAP4]](%[[D7]], %[[D8]], %[[D9]]) // CHECK: %[[D13:.+]] = arith.addi %[[D11]], %[[D6]] : index // CHECK: %[[D14:.+]] = arith.addi %[[D12]], %[[C0]] : index // CHECK: memref.store %[[D10]], %[[D3]][%[[D13]], %[[D14]]] : memref<16x8xf16> -// CHECK: %[[D15:.+]] = vector.extract %[[D5]]#[[D1]][0, 0, 0, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D15:.+]] = vector.extract %[[D5]]#[[D1]][0, 0, 0, 1] : f16 from vector<1x1x2x2xf16> // CHECK-DAG: %[[D16:.+]] = affine.apply #[[MAP5]](%[[D7]], %[[D8]], %[[D9]]) // CHECK: %[[D17:.+]] = arith.addi %[[D16]], %[[C0]] : index // CHECK: memref.store %[[D15]], %[[D3]][%[[D13]], %[[D17]]] : memref<16x8xf16> -// CHECK: %[[D18:.+]] = vector.extract %[[D5]]#[[D1]][0, 0, 1, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D18:.+]] = vector.extract %[[D5]]#[[D1]][0, 0, 1, 0] : f16 from vector<1x1x2x2xf16> // CHECK-DAG: %[[D19:.+]] = affine.apply #[[MAP8]](%[[D7]], %[[D8]], %[[D9]]) // CHECK: %[[D20:.+]] = arith.addi %[[D19]], %[[D6]] : index // CHECK: memref.store %[[D18]], %[[D3]][%[[D20]], %[[D14]]] : memref<16x8xf16> -// CHECK: %[[D21:.+]] = vector.extract %[[D5]]#[[D1]][0, 0, 1, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D21:.+]] = vector.extract %[[D5]]#[[D1]][0, 0, 1, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D21]], %[[D3]][%[[D20]], %[[D17]]] : memref<16x8xf16> // CHECK: return // CHECK: } @@ -797,9 +797,9 @@ 
builtin.module { // CHECK: %[[D55:.+]] = vector.broadcast %[[D54]] : f16 to vector<1xf16> // CHECK: %[[D56:.+]] = vector.insert_strided_slice %[[D55]], %[[D53]] {offsets = [0, 0, 1, 1], strides = [1]} : // CHECK-SAME: vector<1xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D57:.+]] = vector.extract %[[CST]][0, 0] : vector<1x1x2x2xf16> -// CHECK: %[[D58:.+]] = vector.extract %[[D42]][0, 0] : vector<1x1x4x2xf16> -// CHECK: %[[D59:.+]] = vector.extract %[[D56]][0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D57:.+]] = vector.extract %[[CST]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> +// CHECK: %[[D58:.+]] = vector.extract %[[D42]][0, 0] : vector<4x2xf16> from vector<1x1x4x2xf16> +// CHECK: %[[D59:.+]] = vector.extract %[[D56]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> // CHECK: %[[D60:.+]] = nvgpu.mma.sync(%[[D58]], %[[D59]], %[[D57]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, // CHECK-SAME: vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // CHECK: %[[D61:.+]] = vector.insert %[[D60]], %[[CST]] [0, 0] : vector<2x2xf16> into vector<1x1x2x2xf16> @@ -823,13 +823,13 @@ builtin.module { // CHECK-SAME: vector<1xf16> into vector<1x1x2x2xf16> // CHECK: %[[D74:.+]] = arith.subf %[[D61]], %[[D73]] : vector<1x1x2x2xf16> // CHECK: %[[D75:.+]] = math.exp %[[D74]] : vector<1x1x2x2xf16> -// CHECK: %[[D76:.+]] = vector.extract %[[D75]][0, 0, 0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D76:.+]] = vector.extract %[[D75]][0, 0, 0, 0] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D76]], %[[SUBVIEW]][%[[D9]], %[[D10]]] : memref<16x8xf16, strided<[8, 1], offset: ?>> -// CHECK: %[[D77:.+]] = vector.extract %[[D75]][0, 0, 0, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D77:.+]] = vector.extract %[[D75]][0, 0, 0, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D77]], %[[SUBVIEW]][%[[D9]], %[[D15]]] : memref<16x8xf16, strided<[8, 1], offset: ?>> -// CHECK: %[[D78:.+]] = vector.extract %[[D75]][0, 0, 1, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D78:.+]] = vector.extract %[[D75]][0, 0, 1, 0] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D78]], %[[SUBVIEW]][%[[D30]], %[[D10]]] : memref<16x8xf16, strided<[8, 1], offset: ?>> -// CHECK: %[[D79:.+]] = vector.extract %[[D75]][0, 0, 1, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D79:.+]] = vector.extract %[[D75]][0, 0, 1, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D79]], %[[SUBVIEW]][%[[D30]], %[[D15]]] : memref<16x8xf16, strided<[8, 1], offset: ?>> // CHECK: return // CHECK: } @@ -902,8 +902,8 @@ builtin.module { // CHECK: %[[OFF11:.+]] = arith.addi %[[LANEOFF2]], %[[OFF4]] : index // CHECK: %[[LD4:.+]] = nvgpu.ldmatrix %{{.*}}[%[[OFF11]], %[[OFF9]]] {numTiles = 1 : i32, transpose = true} : memref<8x16xf16, #gpu.address_space> -> vector<1x2xf16> // CHECK: %[[V5:.+]] = vector.insert_strided_slice %[[LD4]], %[[V4]] {offsets = [0, 0, 1, 0], strides = [1, 1]} : vector<1x2xf16> into vector<1x1x2x2xf16> -// CHECK: %[[A:.+]] = vector.extract %[[V3]][0, 0] : vector<1x1x4x2xf16> -// CHECK: %[[B:.+]] = vector.extract %[[V5]][0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[A:.+]] = vector.extract %[[V3]][0, 0] : vector<4x2xf16> from vector<1x1x4x2xf16> +// CHECK: %[[B:.+]] = vector.extract %[[V5]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> // CHECK: nvgpu.mma.sync(%[[A]], %[[B]], %{{.*}}) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // ----- @@ -1218,14 +1218,14 @@ builtin.module { // CHECK: %[[D70:.+]] = vector.broadcast %[[D69]] : f16 to vector<1xf16> // CHECK: %[[D71:.+]] = 
vector.insert_strided_slice %[[D70]], %[[D68]] {offsets = [1, 0, 1, 1], strides = [1]} : // CHECK-SAME: vector<1xf16> into vector<2x1x2x2xf16> -// CHECK: %[[D72:.+]] = vector.extract %[[CST]][0, 0] : vector<1x2x2x2xf16> -// CHECK: %[[D73:.+]] = vector.extract %[[D43]][0, 0] : vector<1x1x4x2xf16> -// CHECK: %[[D74:.+]] = vector.extract %[[D71]][0, 0] : vector<2x1x2x2xf16> +// CHECK: %[[D72:.+]] = vector.extract %[[CST]][0, 0] : vector<2x2xf16> from vector<1x2x2x2xf16> +// CHECK: %[[D73:.+]] = vector.extract %[[D43]][0, 0] : vector<4x2xf16> from vector<1x1x4x2xf16> +// CHECK: %[[D74:.+]] = vector.extract %[[D71]][0, 0] : vector<2x2xf16> from vector<2x1x2x2xf16> // CHECK: %[[D75:.+]] = nvgpu.mma.sync(%[[D73]], %[[D74]], %[[D72]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, // CHECK-SAME: vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // CHECK: %[[D76:.+]] = vector.insert %[[D75]], %[[CST]] [0, 0] : vector<2x2xf16> into vector<1x2x2x2xf16> -// CHECK: %[[D77:.+]] = vector.extract %[[CST]][0, 1] : vector<1x2x2x2xf16> -// CHECK: %[[D78:.+]] = vector.extract %[[D71]][1, 0] : vector<2x1x2x2xf16> +// CHECK: %[[D77:.+]] = vector.extract %[[CST]][0, 1] : vector<2x2xf16> from vector<1x2x2x2xf16> +// CHECK: %[[D78:.+]] = vector.extract %[[D71]][1, 0] : vector<2x2xf16> from vector<2x1x2x2xf16> // CHECK: %[[D79:.+]] = nvgpu.mma.sync(%[[D73]], %[[D78]], %[[D77]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, // CHECK-SAME: vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // CHECK: %[[D80:.+]] = vector.insert %[[D79]], %[[D76]] [0, 1] : vector<2x2xf16> into vector<1x2x2x2xf16> @@ -1247,31 +1247,31 @@ builtin.module { // CHECK: %[[D91:.+]] = vector.broadcast %[[D90]] : f16 to vector<1xf16> // CHECK: %[[D92:.+]] = vector.insert_strided_slice %[[D91]], %[[D89]] {offsets = [0, 0, 1, 1], strides = [1]} : // CHECK-SAME: vector<1xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D93:.+]] = vector.extract %[[D80]][0, 0] : vector<1x2x2x2xf16> +// CHECK: %[[D93:.+]] = vector.extract %[[D80]][0, 0] : vector<2x2xf16> from vector<1x2x2x2xf16> // CHECK: %[[D94:.+]] = vector.insert_strided_slice %[[D93]], %[[CST_1]] {offsets = [0, 0, 0, 0], strides = [1, // CHECK-SAME: 1]} : vector<2x2xf16> into vector<1x1x4x2xf16> -// CHECK: %[[D95:.+]] = vector.extract %[[D80]][0, 1] : vector<1x2x2x2xf16> +// CHECK: %[[D95:.+]] = vector.extract %[[D80]][0, 1] : vector<2x2xf16> from vector<1x2x2x2xf16> // CHECK: %[[D96:.+]] = vector.insert_strided_slice %[[D95]], %[[D94]] {offsets = [0, 0, 2, 0], strides = [1, 1]} // CHECK-SAME: : vector<2x2xf16> into vector<1x1x4x2xf16> -// CHECK: %[[D97:.+]] = vector.extract %[[CST_0]][0, 0] : vector<1x1x2x2xf16> -// CHECK: %[[D98:.+]] = vector.extract %[[D96]][0, 0] : vector<1x1x4x2xf16> -// CHECK: %[[D99:.+]] = vector.extract %[[D92]][0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D97:.+]] = vector.extract %[[CST_0]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> +// CHECK: %[[D98:.+]] = vector.extract %[[D96]][0, 0] : vector<4x2xf16> from vector<1x1x4x2xf16> +// CHECK: %[[D99:.+]] = vector.extract %[[D92]][0, 0] : vector<2x2xf16> from vector<1x1x2x2xf16> // CHECK: %[[D100:.+]] = nvgpu.mma.sync(%[[D98]], %[[D99]], %[[D97]]) {mmaShape = [16, 8, 16]} : // CHECK-SAME: (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> // CHECK: %[[D101:.+]] = vector.insert %[[D100]], %[[CST_0]] [0, 0] : vector<2x2xf16> into vector<1x1x2x2xf16> -// CHECK: %[[D102:.+]] = vector.extract %[[D101]][0, 0, 0, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D102:.+]] = vector.extract %[[D101]][0, 0, 0, 0] : f16 from 
vector<1x1x2x2xf16> // CHECK: %[[D103:.+]] = arith.addi %[[D8]], %[[C0]] : index // CHECK: memref.store %[[D102]], %[[SUBVIEW]][%[[D103]], %[[D11]]] : memref<16x8xf16, strided<[8, 1], offset: // CHECK-SAME: ?>> -// CHECK: %[[D104:.+]] = vector.extract %[[D101]][0, 0, 0, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D104:.+]] = vector.extract %[[D101]][0, 0, 0, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D104]], %[[SUBVIEW]][%[[D103]], %[[D16]]] : memref<16x8xf16, strided<[8, 1], offset: // CHECK-SAME: ?>> -// CHECK: %[[D105:.+]] = vector.extract %[[D101]][0, 0, 1, 0] : vector<1x1x2x2xf16> +// CHECK: %[[D105:.+]] = vector.extract %[[D101]][0, 0, 1, 0] : f16 from vector<1x1x2x2xf16> // CHECK: %[[D106:.+]] = arith.addi %[[D30]], %[[C0]] : index // CHECK: memref.store %[[D105]], %[[SUBVIEW]][%[[D106]], %[[D11]]] : memref<16x8xf16, strided<[8, 1], offset: // CHECK-SAME: ?>> -// CHECK: %[[D107:.+]] = vector.extract %[[D101]][0, 0, 1, 1] : vector<1x1x2x2xf16> +// CHECK: %[[D107:.+]] = vector.extract %[[D101]][0, 0, 1, 1] : f16 from vector<1x1x2x2xf16> // CHECK: memref.store %[[D107]], %[[SUBVIEW]][%[[D106]], %[[D16]]] : memref<16x8xf16, strided<[8, 1], offset: // CHECK-SAME: ?>> // CHECK: return -// CHECK: } \ No newline at end of file +// CHECK: } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/legalize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/legalize.mlir index a06a2c5a3745..7a8b82f616ca 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/legalize.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/legalize.mlir @@ -4,31 +4,31 @@ func.func @scalarize( %arg0: vector<3x1x2xf32>,%arg1: vector<2xf32>, %arg2: vector<2xf32>) -> (vector<3x1x2xf32>, vector<2xf32>) { -// CHECK: %[[E_0_0:.+]] = vector.extract %{{.*}}[0, 0, 0] : vector<3x1x2xf32> +// CHECK: %[[E_0_0:.+]] = vector.extract %{{.*}}[0, 0, 0] : f32 from vector<3x1x2xf32> // CHECK: %[[S_0_0:.+]] = math.sqrt %[[E_0_0]] : f32 // CHECK: vector.insert %[[S_0_0]], %{{.*}} [0, 0, 0] : f32 into vector<3x1x2xf32> -// CHECK: %[[E_1_0:.+]] = vector.extract %{{.*}}[1, 0, 0] : vector<3x1x2xf32> +// CHECK: %[[E_1_0:.+]] = vector.extract %{{.*}}[1, 0, 0] : f32 from vector<3x1x2xf32> // CHECK: %[[S_1_0:.+]] = math.sqrt %[[E_1_0]] : f32 // CHECK: vector.insert %[[S_1_0]], %{{.*}} [1, 0, 0] : f32 into vector<3x1x2xf32> -// CHECK: %[[E_2_0:.+]] = vector.extract %{{.*}}[2, 0, 0] : vector<3x1x2xf32> +// CHECK: %[[E_2_0:.+]] = vector.extract %{{.*}}[2, 0, 0] : f32 from vector<3x1x2xf32> // CHECK: %[[S_2_0:.+]] = math.sqrt %[[E_2_0]] : f32 // CHECK: vector.insert %[[S_2_0]], %{{.*}} [2, 0, 0] : f32 into vector<3x1x2xf32> -// CHECK: %[[E_0_1:.+]] = vector.extract %{{.*}}[0, 0, 1] : vector<3x1x2xf32> +// CHECK: %[[E_0_1:.+]] = vector.extract %{{.*}}[0, 0, 1] : f32 from vector<3x1x2xf32> // CHECK: %[[S_0_1:.+]] = math.sqrt %[[E_0_1]] : f32 // CHECK: vector.insert %[[S_0_1]], %{{.*}} [0, 0, 1] : f32 into vector<3x1x2xf32> -// CHECK: %[[E_1_1:.+]] = vector.extract %{{.*}}[1, 0, 1] : vector<3x1x2xf32> +// CHECK: %[[E_1_1:.+]] = vector.extract %{{.*}}[1, 0, 1] : f32 from vector<3x1x2xf32> // CHECK: %[[S_1_1:.+]] = math.sqrt %[[E_1_1]] : f32 // CHECK: vector.insert %[[S_1_1]], %{{.*}} [1, 0, 1] : f32 into vector<3x1x2xf32> -// CHECK: %[[E_2_1:.+]] = vector.extract %{{.*}}[2, 0, 1] : vector<3x1x2xf32> +// CHECK: %[[E_2_1:.+]] = vector.extract %{{.*}}[2, 0, 1] : f32 from vector<3x1x2xf32> // CHECK: %[[S_2_1:.+]] = math.sqrt %[[E_2_1]] : f32 // CHECK: vector.insert %[[S_2_1]], %{{.*}} [2, 0, 1] : f32 into vector<3x1x2xf32> 
%0 = math.sqrt %arg0 : vector<3x1x2xf32> -// CHECK: %[[E0:.+]] = vector.extract %{{.*}}[0] : vector<2xf32> -// CHECK: %[[E1:.+]] = vector.extract %{{.*}}[0] : vector<2xf32> +// CHECK: %[[E0:.+]] = vector.extract %{{.*}}[0] : f32 from vector<2xf32> +// CHECK: %[[E1:.+]] = vector.extract %{{.*}}[0] : f32 from vector<2xf32> // CHECK: %[[P0:.+]] = math.powf %[[E0]], %[[E1]] : f32 // CHECK: vector.insert %[[P0]], %{{.*}} [0] : f32 into vector<2xf32> -// CHECK: %[[E2:.+]] = vector.extract %{{.*}}[1] : vector<2xf32> -// CHECK: %[[E3:.+]] = vector.extract %{{.*}}[1] : vector<2xf32> +// CHECK: %[[E2:.+]] = vector.extract %{{.*}}[1] : f32 from vector<2xf32> +// CHECK: %[[E3:.+]] = vector.extract %{{.*}}[1] : f32 from vector<2xf32> // CHECK: %[[P1:.+]] = math.powf %[[E2]], %[[E3]] : f32 // CHECK: vector.insert %[[P1]], %{{.*}} [1] : f32 into vector<2xf32> %1 = math.powf %arg1, %arg2 : vector<2xf32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir index 3394639c1717..56bf6ad04bf1 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir @@ -59,12 +59,12 @@ hal.executable.variant @cuda, target = <"cuda", "cuda-nvptx-fb"> { // CHECK: %[[TID4:.+]] = affine.apply #[[$MAP]]()[%[[TID]]] // CHECK: %[[R0:.+]] = scf.for %{{.*}} = %[[TID4]] to %[[C10240]] step %[[C1024]] iter_args(%[[A0:.+]] = %[[CST]]) -> (vector<1xf32>) { // CHECK: %[[V:.+]] = vector.transfer_read {{.*}} {in_bounds = [true]} : memref<512x10240xf32, #hal.descriptor_type>, vector<4xf32> -// CHECK: %[[E:.+]] = vector.extract %[[A0]][0] : vector<1xf32> +// CHECK: %[[E:.+]] = vector.extract %[[A0]][0] : f32 from vector<1xf32> // CHECK: %[[RL:.+]] = vector.reduction , %[[V]], %[[E]] : vector<4xf32> into f32 // CHECK: %[[B:.+]] = vector.broadcast %[[RL:.*]] : f32 to vector<1xf32> // CHECK: scf.yield %[[B]] : vector<1xf32> // CHECK: } -// CHECK: %[[R1:.+]] = vector.extract %[[R0]][0] : vector<1xf32> +// CHECK: %[[R1:.+]] = vector.extract %[[R0]][0] : f32 from vector<1xf32> // CHECK: %[[S0:.+]], %{{.*}} = gpu.shuffle xor %[[R1]], %[[C1]], %[[C32]] : f32 // CHECK: %[[R2:.+]] = arith.addf %[[R1]], %[[S0]] : f32 // CHECK: %[[S1:.+]], %{{.*}} = gpu.shuffle xor %[[R2]], %[[C2]], %[[C32]] : f32 diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir index 45fb4e9c4161..3e2e72f104aa 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir @@ -84,7 +84,7 @@ module attributes {hal.device.targets = [#device_target_cuda]} { // CHECK: transform.sequence failures(propagate) { // CHECK: transform.iree.register_match_callbacks // CHECK: %[[MATCH:.+]]:2 = transform.iree.match_callback failures(propagate) "batch_matmul" -// CHECK: %[[FORALL:.+]], %[[TILED:.+]] = transform.structured.tile_to_forall_op %[[MATCH]]#1 +// CHECK: %[[TILED:.+]], %[[FORALL:.+]] = transform.structured.tile_using_forall %[[MATCH]]#1 // DEFAULT: num_threads [] tile_sizes [64, 64, 1](mapping = [#gpu.block, #gpu.block, #gpu.block]) // OPTIONS: num_threads [] tile_sizes [128, 64, 32](mapping = [#gpu.block, #gpu.block, #gpu.block]) // CHECK: apply_patterns @@ -92,7 +92,7 @@ module attributes 
{hal.device.targets = [#device_target_cuda]} { // CHECK: transform.iree.apply_cse // CHECK: %[[FUSED:.+]], %[[CONTAINING:.+]] = transform.structured.fuse_into_containing_op %[[MATCH]]#0 into %[[FORALL]] // CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice %[[FORALL]] -// CHECK: %[[TILED_LINALG:.+]], %[[LOOPS:.+]] = transform.structured.tile %tiled_op +// CHECK: %[[TILED_LINALG:.+]], %[[LOOPS:.+]] = transform.structured.tile_using_for %tiled_op // DEFAULT: [0, 0, 0, 16] // OPTIONS: [0, 0, 0, 8] // CHECK: %[[PADDED:.+]], %{{.*}}, %{{.+}} = transform.structured.pad %tiled_linalg_op @@ -113,7 +113,7 @@ module attributes {hal.device.targets = [#device_target_cuda]} { // CHECK: %[[RHS:.+]] = get_producer_of_operand %[[PADDED]][1] // CHECK: %[[RHS_DPS:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS]] -// CHECK: transform.structured.tile_to_forall_op %[[LHS]] +// CHECK: transform.structured.tile_using_forall %[[LHS]] // DEFAULT: num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // OPTIONS: num_threads [1, 64, 2] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // CHECK: apply_patterns @@ -122,28 +122,28 @@ module attributes {hal.device.targets = [#device_target_cuda]} { // CHECK: transform.structured.match ops{["scf.if"]} // CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch -// CHECK: transform.structured.tile_to_forall_op %[[RHS_DPS]] +// CHECK: transform.structured.tile_using_forall %[[RHS_DPS]] // DEFAULT: num_threads [8, 16, 1] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // OPTIONS: num_threads [2, 8, 8] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // CHECK: apply_patterns // CHECK: transform.iree.apply_licm // CHECK: transform.iree.apply_cse -// CHECK: transform.structured.tile_to_forall_op +// CHECK: transform.structured.tile_using_forall // DEFAULT: num_threads [2, 64, 1] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // OPTIONS: num_threads [1, 16, 8] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // CHECK: apply_patterns // CHECK: transform.iree.apply_licm // CHECK: transform.iree.apply_cse -// CHECK: transform.structured.tile_to_forall_op +// CHECK: transform.structured.tile_using_forall // DEFAULT: num_threads [1, 2, 64] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // OPTIONS: num_threads [1, 4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // CHECK: apply_patterns // CHECK: transform.iree.apply_licm // CHECK: transform.iree.apply_cse -// CHECK: %forall_op_8, %tiled_op_9 = transform.structured.tile_to_forall_op %[[FILL]] +// CHECK: %tiled_op_8, %forall_op_9 = transform.structured.tile_using_forall %[[FILL]] // DEFAULT: num_threads [1, 2, 64] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // OPTIONS: num_threads [1, 4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) // CHECK: apply_patterns diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir index 440f93b379b9..b4611e9e1a57 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir @@ -34,13 +34,13 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: 
transform.structured.convert_conv2d_to_img2col // CHECK: get_producer_of_operand %{{.*}}[0] // CHECK: transform.apply_patterns.iree.bubble_collapse -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [] tile_sizes [1, 128, 128](mapping = [#gpu.block, #gpu.block, #gpu.block]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [] tile_sizes [1, 128, 128](mapping = [#gpu.block, #gpu.block, #gpu.block]) // CHECK: transform.structured.fuse_into_containing_op // CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice %{{.*}} // CHECK: transform.structured.match ops{["linalg.fill"]} // CHECK: transform.structured.fuse_into_containing_op // CHECK: transform.structured.fuse_into_containing_op -// CHECK: transform.structured.tile %{{.*}}[0, 0, 0, 16] +// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 0, 16] // CHECK: transform.structured.fuse_into_containing_op // CHECK: transform.structured.pad %{{.*}} {copy_back_op = "none", pack_paddings = [1, 0, 1], pad_to_multiple_of = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} // CHECK: transform.structured.match ops{["linalg.fill"]} @@ -49,10 +49,10 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0] // CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1] // CHECK: transform.structured.rewrite_in_destination_passing_style %[[LHS]] -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_to_forall_op %[[RHS]] num_threads [1, 4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %[[RHS]] num_threads [1, 4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) // CHECK: transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface // CHECK: transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices // CHECK: transform.apply_patterns.vector.cast_away_vector_leading_one_dim @@ -101,17 +101,17 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK-LABEL: func @nhwc_convolution // CHECK: transform.sequence failures(propagate) { -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [] tile_sizes [1, 128, 128](mapping = [#gpu.block, #gpu.block, #gpu.block]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [] tile_sizes [1, 128, 128](mapping = [#gpu.block, #gpu.block, #gpu.block]) // CHECK: transform.structured.pad %{{.*}} {copy_back_op = "none", pack_paddings = [0, 1, 1], pad_to_multiple_of = [1, 1, 1, 1], padding_dimensions = [0, 1, 2, 3], padding_values = [0.000000e+00 : 
f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} // CHECK: %[[RES:.+]] = get_producer_of_operand %{{.*}}[2] // CHECK: transform.structured.rewrite_in_destination_passing_style %[[RES]] // CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0] // CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1] // CHECK: transform.structured.rewrite_in_destination_passing_style %[[RHS]] -// CHECK: transform.structured.tile_to_forall_op %[[LHS]] num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %[[LHS]] num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp, #gpu.warp]) // CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir index 0bdf1b534374..c80f916a8d12 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir @@ -69,20 +69,20 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: transform.sequence failures(propagate) { // CHECK: transform.iree.match_callback failures(propagate) "matmul" -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [] tile_sizes [128, 128](mapping = [#gpu.block, #gpu.block]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [] tile_sizes [128, 128](mapping = [#gpu.block, #gpu.block]) // CHECK: transform.structured.fuse_into_containing_op // CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice -// CHECK: transform.structured.tile %{{.*}}[0, 0, 16] +// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 16] // CHECK: transform.structured.pad %{{.*}} {copy_back_op = "none", pack_paddings = [1, 1, 1], pad_to_multiple_of = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} // CHECK: transform.structured.hoist_pad %{{.}} by 1 loops // CHECK: transform.structured.insert_slice_to_copy %{{.*}} : (!transform.any_op) -> !transform.any_op -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) // CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [4, 32] 
tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) // CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) // CHECK: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] // CHECK: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] // CHECK: transform.structured.vectorize %{{.*}} vector_sizes [32, 4] @@ -130,21 +130,21 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // WITH_OPTIONS: transform.sequence failures(propagate) { // WITH_OPTIONS: transform.iree.match_callback failures(propagate) "matmul" // Tile sizes are set by td-matmul-strategy-blk-size-XX. -// WITH_OPTIONS: transform.structured.tile_to_forall_op %{{.*}} num_threads [] tile_sizes [256, 64](mapping = [#gpu.block, #gpu.block]) +// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [] tile_sizes [256, 64](mapping = [#gpu.block, #gpu.block]) // WITH_OPTIONS: transform.structured.fuse_into_containing_op // WITH_OPTIONS: transform.iree.populate_workgroup_count_region_using_num_threads_slice // The tiling is affected by td-matmul-strategy-reduc-size: 8. 
-// WITH_OPTIONS: transform.structured.tile %{{.*}}[0, 0, 8] +// WITH_OPTIONS: transform.structured.tile_using_for %{{.*}}[0, 0, 8] // WITH_OPTIONS: transform.structured.pad %{{.*}} {copy_back_op = "none", pack_paddings = [1, 1, 1], pad_to_multiple_of = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} // WITH_OPTIONS: transform.structured.hoist_pad %{{.}} by 1 loops // WITH_OPTIONS: transform.structured.insert_slice_to_copy %{{.*}} : (!transform.any_op) -> !transform.any_op -// WITH_OPTIONS: transform.structured.tile_to_forall_op %{{.*}} num_threads [64, 2] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [64, 2] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) // WITH_OPTIONS: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// WITH_OPTIONS: transform.structured.tile_to_forall_op %{{.*}} num_threads [8, 16] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) // WITH_OPTIONS: transform.scf.take_assumed_branch %{{.*}} take_else_branch : (!transform.any_op) -> () -// WITH_OPTIONS: transform.structured.tile_to_forall_op %{{.*}} num_threads [8, 16] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) -// WITH_OPTIONS: transform.structured.tile_to_forall_op %{{.*}} num_threads [4, 1] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) -// WITH_OPTIONS: transform.structured.tile_to_forall_op %{{.*}} num_threads [4, 1] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [4, 1] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// WITH_OPTIONS: transform.structured.tile_using_forall %{{.*}} num_threads [4, 1] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) // WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [4, 4] // WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [1, 4] // WITH_OPTIONS: transform.structured.vectorize %{{.*}} vector_sizes [32, 4] @@ -235,17 +235,17 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: transform.sequence failures(propagate) { // CHECK: transform.iree.match_callback failures(propagate) "matmul" -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [] tile_sizes [128, 128](mapping = [#gpu.block, #gpu.block]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [] tile_sizes [128, 128](mapping = [#gpu.block, #gpu.block]) // CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice -// CHECK: transform.structured.tile %{{.*}}[0, 0, 16] +// CHECK: transform.structured.tile_using_for %{{.*}}[0, 0, 16] // align1 -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [8, 16] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [8, 16] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) // align2 -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 64] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 64] tile_sizes [](mapping = [#gpu.thread, 
#gpu.thread]) // align2 -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 64] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 64] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) // align1 // CHECK: transform.structured.vectorize %{{.*}} vector_sizes [16, 1] // align2 @@ -322,7 +322,7 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK-LABEL: func @matmul_4_partially_unaligned -// CHECK: transform.structured.tile %tiled_op[0, 0, 16] +// CHECK: transform.structured.tile_using_for %tiled_op[0, 0, 16] // Make sure we do not canonicalize because the result is still aligned. // CHECK-NEXT: transform.structured.pad %tiled_linalg_op @@ -339,14 +339,14 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: %[[RES_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RES_PAD]] // CHECK: %[[LHS_PAD:.+]] = get_producer_of_operand %{{.*}}[0] // CHECK: %[[RHS_PAD:.+]] = get_producer_of_operand %{{.*}}[1] -// CHECK: %{{.*}}, %[[TILED_LHS:.+]] = transform.structured.tile_to_forall_op %[[LHS_PAD]] num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: %[[TILED_LHS:.+]], %{{.*}} = transform.structured.tile_using_forall %[[LHS_PAD]] num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) // CHECK: transform.structured.match ops{["scf.if"]} // CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch -// CHECK: %{{.*}}, %[[TILED_RHS:.+]] = transform.structured.tile_to_forall_op %[[RHS_PAD]] num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: %[[TILED_RHS:.+]], %{{.*}} = transform.structured.tile_using_forall %[[RHS_PAD]] num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) // CHECK: transform.structured.match ops{["scf.if"]} // CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) // CHECK: transform.apply_patterns.canonicalization // CHECK } // CHECK: transform.iree.apply_licm @@ -394,7 +394,7 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK-LABEL: func @aligned_matmul // Block level is the same for aligned. -// CHECK: transform.structured.tile %tiled_op[0, 0, 16] +// CHECK: transform.structured.tile_using_for %tiled_op[0, 0, 16] // Make sure we do not canonicalize if the result is aligned to avoid folding the extract_slice on the iterator. 
// CHECK-NEXT: transform.structured.pad %tiled_linalg_op @@ -415,10 +415,10 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: %[[RHS_PAD:.+]] = get_producer_of_operand %{{.*}}[1] // CHECK: %[[LHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[LHS_PAD]] // CHECK: %[[RHS_COPY:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS_PAD]] -// CHECK: transform.structured.tile_to_forall_op %[[LHS_COPY]] num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_to_forall_op %[[RHS_COPY]] num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) -// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %[[LHS_COPY]] num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %[[RHS_COPY]] num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) +// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [2, 2] tile_sizes [](mapping = [#gpu.warp, #gpu.warp]) // CHECK: transform.apply_patterns.canonicalization // CHECK } // CHECK: transform.iree.apply_licm diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir index 49afab70ce19..2b444efc4ad0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir @@ -47,7 +47,7 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: transform.sequence failures(propagate) { // CHECK: transform.iree.register_match_callbacks // CHECK: {{.*}} = transform.iree.match_callback failures(propagate) "pad"({{.*}}) : (!transform.any_op) -> !transform.any_op -// CHECK: transform.structured.tile_to_forall_op {{.*}} num_threads [] tile_sizes [64, 64](mapping = [#gpu.block, #gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +// CHECK: transform.structured.tile_using_forall {{.*}} num_threads [] tile_sizes [64, 64](mapping = [#gpu.block, #gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) // CHECK: apply_patterns to %{{.*}} { // CHECK: transform.apply_patterns.canonicalization // CHECK } @@ -56,7 +56,7 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: {{.*}} = transform.structured.match ops{["scf.if"]} in {{.*}} : (!transform.any_op) -> !transform.any_op // CHECK: transform.scf.take_assumed_branch {{.*}} take_else_branch : (!transform.any_op) -> () // CHECK: transform.iree.populate_workgroup_count_region_using_num_threads_slice {{.*}} : (!transform.any_op) -> () -// CHECK: {{.*}} = transform.structured.tile_to_forall_op {{.*}} num_threads [16, 16] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) +// CHECK: {{.*}} = transform.structured.tile_using_forall {{.*}} num_threads [16, 16] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) 
-> (!transform.any_op, !transform.any_op) // CHECK: apply_patterns to %{{.*}} { // CHECK: transform.apply_patterns.canonicalization // CHECK } @@ -96,8 +96,8 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb", // CHECK: transform.iree.apply_cse // WITH_OPTIONS-LABEL: func @pad -// WITH_OPTIONS: transform.structured.tile_to_forall_op {{.*}} num_threads [] tile_sizes [32, 16](mapping = [#gpu.block, #gpu.block]) -// WITH_OPTIONS: {{.*}} = transform.structured.tile_to_forall_op {{.*}} num_threads [4, 8] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) +// WITH_OPTIONS: transform.structured.tile_using_forall {{.*}} num_threads [] tile_sizes [32, 16](mapping = [#gpu.block, #gpu.block]) +// WITH_OPTIONS: {{.*}} = transform.structured.tile_using_forall {{.*}} num_threads [4, 8] tile_sizes [](mapping = [#gpu.thread, #gpu.thread]) // WITH_OPTIONS: transform.structured.vectorize {{.*}} vector_sizes [2, 4] : !transform.any_op // WITH_OPTIONS: transform.iree.map_nested_forall_to_gpu_threads {{.*}} workgroup_dims = [8, 4, 1] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir index fb88302c780e..d3df044eb67b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir @@ -1,12 +1,12 @@ transform.sequence failures(propagate) { ^bb1(%variant_op: !transform.any_op): %0 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall, %tiled_fill = transform.structured.tile_to_forall_op %0 num_threads [5, 1] + %forall, %tiled_fill = transform.structured.tile_using_forall %0 num_threads [5, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %1 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_2, %tiled_matmul = transform.structured.tile_to_forall_op %1 num_threads [7, 9] + %forall_2, %tiled_matmul = transform.structured.tile_using_forall %1 num_threads [7, 9] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir index 5d07097291fe..1558636d6b4a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir @@ -198,7 +198,7 @@ func.func @copies_to_asyncs_mask_2d(%a: memref<1024x1024xf32>, %i: index, %j: in // CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[I]], %[[c2]] : index // CHECK: %[[CNT:.*]] = arith.select %[[CMP]], %[[J]], %[[c0]] : index // CHECK: %[[CP0:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 4, %[[CNT]] - %submask = vector.extract %mask[2] : vector<4x4xi1> + %submask = vector.extract %mask[2] : vector<4xi1> from vector<4x4xi1> %1 = vector.transfer_read %a[%c0, %c0], %cst_0, %submask {in_bounds = [true]} : memref<1024x1024xf32>, vector<4xf32> vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space> // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]] diff --git 
a/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp index 8064dce85885..4fb57eb74ba4 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp @@ -447,9 +447,7 @@ void ConvertToSPIRVPass::runOnOperation() { SPIRVTypeConverter typeConverter(targetAttr, options); // Additionally pull in conversion rules for GPU subgroup MMA ops. - typeConverter.addConversion([&](gpu::MMAMatrixType type) -> Type { - return convertMMAToSPIRVCoopMatrixType(type); - }); + populateMMAToSPIRVCoopMatrixTypeConversion(typeConverter); RewritePatternSet patterns(&getContext()); ScfToSPIRVContext scfToSPIRVContext; diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTile.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTile.cpp index 13a7290f7b02..0f40efe8e40f 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTile.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTile.cpp @@ -138,7 +138,7 @@ static LogicalResult tileAndDistributeToThreads(linalg::LinalgOp consumerOp, // We don't distribute here; instead, it will be done in a later step // after bufferization. So add attributes to the tiled loop nest to // indicate that they should be distributed to invocations. - ArrayRef<scf::ForOp> loops = tileAndFuseResult.value().loops; + ArrayRef<Operation *> loops = tileAndFuseResult.value().loops; const char *attrName = getSPIRVDistributeAttrName(); // We can have more than 3 dimensions being tiled (e.g., for convolutions with // non-1 batch). But only the innermost 3 dimensions are distributed. @@ -273,10 +273,10 @@ static LogicalResult tileAndUnrollConvWindow(func::FuncOp funcOp, // for parallel output window dimension, so it helps future vector // transformations.
- ArrayRef<scf::ForOp> loops = tileAndFuseResult.value().loops; + ArrayRef<Operation *> loops = tileAndFuseResult.value().loops; if (!loops.empty()) { assert(loops.size() == 1); - scf::ForOp loopOp = loops.front(); + scf::ForOp loopOp = cast<scf::ForOp>(loops.front()); IntegerAttr ub; if (!matchPattern(loopOp.getUpperBound(), m_Constant(&ub))) { return loopOp.emitOpError("upper bound should be a constant"); diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/break_down_large_vector.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/break_down_large_vector.mlir index 0f7b25d5bdfa..b6c0e1f8300b 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/break_down_large_vector.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/break_down_large_vector.mlir @@ -49,7 +49,7 @@ func.func @bitcast_extract_extend_0(%input: vector<1xi32>) -> vector<4xi32> { // CHECK-DAG: %[[OFF1:.+]] = arith.constant 4 : i32 // CHECK-DAG: %[[OFF2:.+]] = arith.constant 8 : i32 // CHECK-DAG: %[[OFF3:.+]] = arith.constant 12 : i32 -// CHECK: %[[BASE:.+]] = vector.extract %[[INPUT]][0] : vector<1xi32> +// CHECK: %[[BASE:.+]] = vector.extract %[[INPUT]][0] : i32 from vector<1xi32> // CHECK: %[[AND0:.+]] = arith.andi %[[BASE]], %[[MASK]] : i32 // CHECK: %[[INS0:.+]] = vector.insert %[[AND0]], %[[ZERO]] [0] // CHECK: %[[SHR1:.+]] = arith.shrui %[[BASE]], %[[OFF1]] : i32 @@ -81,7 +81,7 @@ func.func @bitcast_extract_extend_1(%input: vector<4xi32>) -> vector<4xi32> { // CHECK-DAG: %[[OFF1:.+]] = arith.constant 20 : i32 // CHECK-DAG: %[[OFF2:.+]] = arith.constant 24 : i32 // CHECK-DAG: %[[OFF3:.+]] = arith.constant 28 : i32 -// CHECK: %[[BASE:.+]] = vector.extract %[[INPUT]][2] : vector<4xi32> +// CHECK: %[[BASE:.+]] = vector.extract %[[INPUT]][2] : i32 from vector<4xi32> // CHECK: %[[SHR0:.+]] = arith.shrui %[[BASE]], %[[OFF0]] : i32 // CHECK: %[[AND0:.+]] = arith.andi %[[SHR0]], %[[MASK]] : i32 // CHECK: %[[INS0:.+]] = vector.insert %[[AND0]], %[[ZERO]] [0] diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir index 1d3618c2c488..adc1abe3a83f 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir @@ -67,14 +67,14 @@ hal.executable @i4_dequant_matvec_f32 { // CHECK: %[[READ3:.+]] = vector.transfer_read {{.+}} : memref<86x128xf32, #hal.descriptor_type>, vector<4xf32> // CHECK: %[[EXTEND:.+]] = arith.extui %[[READ0]] : vector<4xi4> to vector<4xi32> // CHECK: %[[CVT:.+]] = arith.uitofp %[[EXTEND]] : vector<4xi32> to vector<4xf32> -// CHECK: %[[EXTRACT0:.+]] = vector.extract %[[READ1]][0] : vector<1xf32> +// CHECK: %[[EXTRACT0:.+]] = vector.extract %[[READ1]][0] : f32 from vector<1xf32> // CHECK: %[[SPLAT0:.+]] = vector.splat %[[EXTRACT0]] : vector<4xf32> // CHECK: %[[SUB:.+]] = arith.subf %[[CVT]], %[[SPLAT0]] : vector<4xf32> -// CHECK: %[[EXTRACT1:.+]] = vector.extract %[[READ2]][0] : vector<1xf32> +// CHECK: %[[EXTRACT1:.+]] = vector.extract %[[READ2]][0] : f32 from vector<1xf32> // CHECK: %[[SPLAT1:.+]] = vector.splat %[[EXTRACT1]] : vector<4xf32> // CHECK: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[SPLAT1]] : vector<4xf32> // CHECK: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<4xf32> -// CHECK: %[[EXTRACT2:.+]] = vector.extract %arg1[0] : vector<1x4xf32> +// CHECK: %[[EXTRACT2:.+]] = vector.extract %arg1[0] : vector<4xf32> from vector<1x4xf32> // CHECK: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[EXTRACT2]] : vector<4xf32> // CHECK: %[[BCAST:.+]] =
vector.broadcast %[[ADD]] : vector<4xf32> to vector<1x4xf32> // CHECK: scf.yield %[[BCAST]] : vector<1x4xf32> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir index 105af2d363cc..2cf3d6bd2daf 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir @@ -15,7 +15,7 @@ func.func @ncw_conv_1d(%input: tensor<2x4x4xf32>, %filter: tensor<4x4x1xf32>, %i // CHECK-COUNT-8: vector.transfer_read %[[INPUT]]{{.+}} : tensor<2x4x4xf32>, vector<4xf32> // CHECK-COUNT-16: vector.transfer_read %[[FILTER]]{{.+}} : tensor<4x4x1xf32>, vector<1xf32> // CHECK-COUNT-8: vector.transfer_read %[[INIT]]{{.+}} : tensor<2x4x4xf32>, vector<4xf32> -// CHECK-COUNT-16: vector.extract %{{.+}}[0] : vector<1xf32> +// CHECK-COUNT-16: vector.extract %{{.+}}[0] : f32 from vector<1xf32> // CHECK-NOT: vector.insert // CHECK-COUNT-32: vector.fma {{.+}} : vector<4xf32> // CHECK-NOT: vector.insert diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_elementwise_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_elementwise_ops.mlir index 847aa383600a..e4e3dd2a869c 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_elementwise_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_elementwise_ops.mlir @@ -55,13 +55,13 @@ func.func @transpose_leading_one_dim(%input: tensor<4x1x1xf32>) -> tensor<1x1x4x // CHECK: %[[R2:.+]] = vector.transfer_read %[[INPUT]][%[[C2]], %[[C0]], %[[C0]]]{{.+}} : tensor<4x1x1xf32>, vector<1xf32> // CHECK: %[[R3:.+]] = vector.transfer_read %[[INPUT]][%[[C3]], %[[C0]], %[[C0]]]{{.+}} : tensor<4x1x1xf32>, vector<1xf32> -// CHECK: %[[E0:.+]] = vector.extract %[[R0]][0] : vector<1xf32> +// CHECK: %[[E0:.+]] = vector.extract %[[R0]][0] : f32 from vector<1xf32> // CHECK: %[[I0:.+]] = vector.insert %[[E0]], %[[ZERO]] [0] : f32 into vector<4xf32> -// CHECK: %[[E1:.+]] = vector.extract %[[R1]][0] : vector<1xf32> +// CHECK: %[[E1:.+]] = vector.extract %[[R1]][0] : f32 from vector<1xf32> // CHECK: %[[I1:.+]] = vector.insert %[[E1]], %[[I0]] [1] : f32 into vector<4xf32> -// CHECK: %[[E2:.+]] = vector.extract %[[R2]][0] : vector<1xf32> +// CHECK: %[[E2:.+]] = vector.extract %[[R2]][0] : f32 from vector<1xf32> // CHECK: %[[I2:.+]] = vector.insert %[[E2]], %[[I1]] [2] : f32 into vector<4xf32> -// CHECK: %[[E3:.+]] = vector.extract %[[R3]][0] : vector<1xf32> +// CHECK: %[[E3:.+]] = vector.extract %[[R3]][0] : f32 from vector<1xf32> // CHECK: %[[I3:.+]] = vector.insert %[[E3]], %[[I2]] [3] : f32 into vector<4xf32> // CHECK: %[[W:.+]] = vector.transfer_write %[[I3]], %{{.+}} diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_gather.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_gather.mlir index c3e20d7b1f1c..a3e3b92d7f73 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_gather.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_gather.mlir @@ -43,21 +43,21 @@ func.func @vector_gather(%arg0: memref<16x1082x1922xi8>, %index_vec: vector<16xi // CHECK-DAG: %[[INIT:.+]] = arith.constant dense<0> : vector<16xi8> // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[IND0:.+]] = vector.extract %[[INDEX_VEC]][0] : vector<16xindex> +// CHECK: %[[IND0:.+]] = vector.extract %[[INDEX_VEC]][0] : index from vector<16xindex> // CHECK: %[[LOAD0:.+]] = vector.load %[[ARG0]][%[[C0]], %[[C0]], %[[IND0]]] : 
memref<16x1082x1922xi8>, vector<1xi8> -// CHECK: %[[EXTRACT0:.+]] = vector.extract %[[LOAD0]][0] : vector<1xi8> +// CHECK: %[[EXTRACT0:.+]] = vector.extract %[[LOAD0]][0] : i8 from vector<1xi8> // CHECK: %[[INSERT0:.+]] = vector.insert %[[EXTRACT0]], %[[SLICE_INIT]] [0] : i8 into vector<4xi8> -// CHECK: %[[IND1:.+]] = vector.extract %[[INDEX_VEC]][1] : vector<16xindex> +// CHECK: %[[IND1:.+]] = vector.extract %[[INDEX_VEC]][1] : index from vector<16xindex> // CHECK: %[[LOAD1:.+]] = vector.load %[[ARG0]][%[[C0]], %[[C0]], %[[IND1]]] : memref<16x1082x1922xi8>, vector<1xi8> -// CHECK: %[[EXTRACT1:.+]] = vector.extract %[[LOAD1]][0] : vector<1xi8> +// CHECK: %[[EXTRACT1:.+]] = vector.extract %[[LOAD1]][0] : i8 from vector<1xi8> // CHECK: %[[INSERT1:.+]] = vector.insert %[[EXTRACT1]], %[[INSERT0]] [1] : i8 into vector<4xi8> -// CHECK: %[[IND2:.+]] = vector.extract %[[INDEX_VEC]][2] : vector<16xindex> +// CHECK: %[[IND2:.+]] = vector.extract %[[INDEX_VEC]][2] : index from vector<16xindex> // CHECK: %[[LOAD2:.+]] = vector.load %[[ARG0]][%[[C0]], %[[C0]], %[[IND2]]] : memref<16x1082x1922xi8>, vector<1xi8> -// CHECK: %[[EXTRACT2:.+]] = vector.extract %[[LOAD2]][0] : vector<1xi8> +// CHECK: %[[EXTRACT2:.+]] = vector.extract %[[LOAD2]][0] : i8 from vector<1xi8> // CHECK: %[[INSERT2:.+]] = vector.insert %[[EXTRACT2]], %[[INSERT1]] [2] : i8 into vector<4xi8> -// CHECK: %[[IND3:.+]] = vector.extract %[[INDEX_VEC]][3] : vector<16xindex> +// CHECK: %[[IND3:.+]] = vector.extract %[[INDEX_VEC]][3] : index from vector<16xindex> // CHECK: %[[LOAD3:.+]] = vector.load %[[ARG0]][%[[C0]], %[[C0]], %[[IND3]]] : memref<16x1082x1922xi8>, vector<1xi8> -// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[LOAD3]][0] : vector<1xi8> +// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[LOAD3]][0] : i8 from vector<1xi8> // CHECK: %[[INSERT3:.+]] = vector.insert %[[EXTRACT3]], %[[INSERT2]] [3] : i8 into vector<4xi8> // CHECK: vector.insert_strided_slice %[[INSERT3]], %[[INIT]] {offsets = [0], strides = [1]} : vector<4xi8> into vector<16xi8> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir index 7904c95e6b1e..708858fd5a46 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir @@ -193,11 +193,11 @@ func.func @scalarize_vector_transfer_op(%arg: vector<3xf32>) -> (vector<3xf32>) // CHECK: %[[V1:.+]] = vector.insert %[[ELEM1]], %[[V0]] [1] : f32 into vector<3xf32> // CHECK: %[[ELEM2:.+]] = memref.load %{{.+}}[%[[INDEX2]]] // CHECK: %[[V2:.+]] = vector.insert %[[ELEM2]], %[[V1]] [2] : f32 into vector<3xf32> - // CHECK: %[[EXT_0:.+]] = vector.extract %{{.*}}[0] : vector<3xf32> + // CHECK: %[[EXT_0:.+]] = vector.extract %{{.*}}[0] : f32 from vector<3xf32> // CHECK: memref.store %[[EXT_0]], %{{.*}}[%[[INDEX0]]] : memref<20xf32> - // CHECK: %[[EXT_1:.+]] = vector.extract %{{.*}}[1] : vector<3xf32> + // CHECK: %[[EXT_1:.+]] = vector.extract %{{.*}}[1] : f32 from vector<3xf32> // CHECK: memref.store %[[EXT_1]], %{{.*}}[%[[INDEX1]]] : memref<20xf32> - // CHECK: %[[EXT_2:.+]] = vector.extract %{{.*}}[2] : vector<3xf32> + // CHECK: %[[EXT_2:.+]] = vector.extract %{{.*}}[2] : f32 from vector<3xf32> // CHECK: memref.store %[[EXT_2]], %{{.*}}[%[[INDEX2]]] : memref<20xf32> // CHECK: return %[[V2]] : vector<3xf32> %3 = vector.transfer_read %0[%c3], %f0 : memref<20xf32>, vector<3xf32> @@ -244,16 +244,16 @@ func.func 
@scalarize_non_minor_identity_transfer_write(%value: vector<4xf32>, %i // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[BUFFER:.+]] = hal.interface.binding.subspan -// CHECK: %[[E0:.+]] = vector.extract %[[VALUE]][0] : vector<4xf32> +// CHECK: %[[E0:.+]] = vector.extract %[[VALUE]][0] : f32 from vector<4xf32> // CHECK: memref.store %[[E0]], %[[BUFFER]][%[[C0]], %[[I1]], %[[I2]], %[[C0]]] // CHECK: %[[PLUS1:.+]] = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%[[I2]]] -// CHECK: %[[E1:.+]] = vector.extract %[[VALUE]][1] : vector<4xf32> +// CHECK: %[[E1:.+]] = vector.extract %[[VALUE]][1] : f32 from vector<4xf32> // CHECK: memref.store %[[E1]], %[[BUFFER]][%[[C0]], %[[I1]], %[[PLUS1]], %[[C0]]] // CHECK: %[[PLUS2:.+]] = affine.apply affine_map<()[s0] -> (s0 + 2)>()[%[[I2]]] -// CHECK: %[[E2:.+]] = vector.extract %[[VALUE]][2] : vector<4xf32> +// CHECK: %[[E2:.+]] = vector.extract %[[VALUE]][2] : f32 from vector<4xf32> // CHECK: memref.store %[[E2]], %[[BUFFER]][%[[C0]], %[[I1]], %[[PLUS2]], %[[C0]]] // CHECK: %[[PLUS3:.+]] = affine.apply affine_map<()[s0] -> (s0 + 3)>()[%[[I2]]] -// CHECK: %[[E3:.+]] = vector.extract %[[VALUE]][3] : vector<4xf32> +// CHECK: %[[E3:.+]] = vector.extract %[[VALUE]][3] : f32 from vector<4xf32> // CHECK: memref.store %[[E3]], %[[BUFFER]][%[[C0]], %[[I1]], %[[PLUS3]], %[[C0]]] // ----- diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir index 69ce41752be3..ab2f8b2ab968 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir @@ -136,10 +136,10 @@ func.func @matmul_broadcast_add(%init: tensor<1x8xf32>, %a: tensor<1x8xf32>, %b: // CHECK-NOT: vector.transpose // CHECK: %[[READ:.+]] = vector.transfer_read %[[BIAS]] -// CHECK: %[[EXT0:.+]] = vector.extract %[[READ]][0] : vector<1xf32> +// CHECK: %[[EXT0:.+]] = vector.extract %[[READ]][0] : f32 from vector<1xf32> // CHECK: %[[BCST0:.+]] = vector.splat %[[EXT0]] : vector<4xf32> // CHECK: %[[ADD0:.+]] = arith.addf %{{.+}}, %[[BCST0]] : vector<4xf32> -// CHECK: %[[EXT1:.+]] = vector.extract %[[READ]][0] : vector<1xf32> +// CHECK: %[[EXT1:.+]] = vector.extract %[[READ]][0] : f32 from vector<1xf32> // CHECK: %[[BCST1:.+]] = vector.splat %[[EXT1]] : vector<4xf32> // CHECK: %[[ADD1:.+]] = arith.addf %{{.+}}, %[[BCST1]] : vector<4xf32> // CHECK: %[[WRITE0:.+]] = vector.transfer_write %[[ADD0]], %[[INIT]][%[[C0]], %[[C0]]] diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp index a5d3c1a2d2e3..a823ea4c4c05 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp @@ -41,7 +41,7 @@ using transform::PrintOp; using transform::SequenceOp; using transform::SplitHandleOp; using transform::SplitReductionOp; -using transform::TileToForallOp; +using transform::TileUsingForallOp; using transform::VectorizeChildrenAndApplyPatternsOp; using transform_ext::RegisterMatchCallbacksOp; using transform_ext::TakeFirstOp; @@ -170,7 +170,7 @@ mlir::iree_compiler::buildTileFuseToScfFor(ImplicitLocOpBuilder &b, bool canonicalize) { assert(opsHToFuse.empty() && "No fusion supported yet"); iree_compiler::TileToScfForAndFuseResult result; - auto tiletoScfForOp = b.create(rootH, tileSizes); + auto tiletoScfForOp = b.create(rootH, 
tileSizes); result.forLoops = tiletoScfForOp.getLoops(); result.tiledOpH = tiletoScfForOp.getTiledLinalgOp(); @@ -213,7 +213,7 @@ buildTileAndFuseAndDistributeImpl(ImplicitLocOpBuilder &b, Value variantH, ArrayRef tileSizesOrNumThreads, ArrayAttr threadDimMapping) { iree_compiler::TileToForallAndFuseAndDistributeResult result; - auto tileToForeachOp = b.create( + auto tileToForeachOp = b.create( rootH, tileSizesOrNumThreads, TileOrNumThreadSpec(), threadDimMapping); result.forallH = tileToForeachOp.getForallOp(); diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp index 4465b634349c..2fdfcef400fc 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.cpp @@ -57,7 +57,7 @@ using iree_compiler::IREE::transform_dialect:: using transform::ConvertConv2DToImg2ColOp; using transform::FuseIntoContainingOp; using transform::MatchOp; -using transform::TileOp; +using transform::TileUsingForOp; using transform_ext::RegisterMatchCallbacksOp; /// Options to set the default values of the matmul strategy. @@ -265,7 +265,7 @@ buildTileFuseToSingleScfFor(ImplicitLocOpBuilder &b, Value isolatedParentOpH, ArrayRef tileSizes) { iree_compiler::TileToScfForAndFuseResult result; Type rootType = rootH.getType(); - auto tiletoScfForOp = b.create(rootType, rootH, tileSizes); + auto tiletoScfForOp = b.create(rootType, rootH, tileSizes); result.forLoops = tiletoScfForOp.getLoops(); result.tiledOpH = tiletoScfForOp.getTiledLinalgOp(); diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dialect_dispatch_spec.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dialect_dispatch_spec.mlir index 5a7c378589d2..6138c83051e9 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dialect_dispatch_spec.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dialect_dispatch_spec.mlir @@ -1,7 +1,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !transform.any_op): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %foreach_op, %tiled_op = transform.structured.tile_to_forall_op %0 num_threads [42, 67] + %tiled_op, %foreach_op = transform.structured.tile_using_forall %0 num_threads [42, 67] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %dispatch_op = transform.iree.forall_to_flow %foreach_op : (!transform.any_op) -> !transform.any_op } diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h index 1ef07cdcab0d..f5e63f2a5ea3 100644 --- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h +++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h @@ -70,8 +70,9 @@ class ErrorCheckingTrackingListener : public transform::TrackingListener { } private: - void notifyPayloadReplacementNotFound(Operation *op, - ValueRange values) override; + void + notifyPayloadReplacementNotFound(Operation *op, ValueRange values, + DiagnosedSilenceableFailure &&diag) override; /// The error 
state of this listener. "Success" indicates that no error /// happened so far. Otherwise, the status contains the most recent error. diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp index ae2ca2f5b2eb..e5d14f0b7839 100644 --- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp +++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp @@ -349,13 +349,19 @@ mlir::transform_ext::StructuredTransformOpsExtension:: //===----------------------------------------------------------------------===// void ErrorCheckingTrackingListener::notifyPayloadReplacementNotFound( - Operation *op, ValueRange values) { + Operation *op, ValueRange values, DiagnosedSilenceableFailure &&diag) { // Certain ops can be dropped safely. if (isa(op)) { LLVM_DEBUG(DBGS() << "Silently dropping scf.for op mapping\n"); return; } + SmallVector diags; + diag.takeDiagnostics(diags); + if (!status.succeeded()) + status.takeDiagnostics(diags); + status = DiagnosedSilenceableFailure::silenceableFailure(std::move(diags)); + status = emitSilenceableFailure( getTransformOp(), "!!! tracking listener failed to find replacement op"); status.attachNote(op->getLoc()) << "replaced op"; diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/drop-schedule.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/drop-schedule.mlir index 59e203241e2a..863dc4df84e6 100644 --- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/drop-schedule.mlir +++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/drop-schedule.mlir @@ -26,7 +26,7 @@ transform.with_pdl_patterns { transform.sequence %arg0: !pdl.operation failures(propagate) { ^bb1(%arg1: !pdl.operation): %0 = pdl_match @pdl_target in %arg1 : (!pdl.operation) -> !pdl.operation - transform.structured.tile %0 [4, 4, 4] + transform.structured.tile_using_for %0 [4, 4, 4] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation) } } diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir index 4821351780e5..e0ba19702bb5 100644 --- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir +++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir @@ -5,11 +5,11 @@ transform.sequence failures(propagate) { ^bb0(%arg0: !pdl.operation): // CHECK: %[[OPS:.*]] = pdl_match @match1 in %{{.*}} %0 = pdl_match @match1 in %arg0 : (!pdl.operation) -> !pdl.operation - // CHECK: %[[TILED:.*]], %{{.*}}:3 = transform.structured.tile %[[OPS]][4, 4, 4] - %1, %loops1:3 = transform.structured.tile %0 [4, 4, 4] + // CHECK: %[[TILED:.*]], %{{.*}}:3 = transform.structured.tile_using_for %[[OPS]][4, 4, 4] + %1, %loops1:3 = transform.structured.tile_using_for %0 [4, 4, 4] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation) - // CHECK: %[[TILED2:.*]], %{{.*}}:3 = transform.structured.tile %[[TILED]] - %2, %loops2:3 = transform.structured.tile %1 [2, 2, 2] + // CHECK: %[[TILED2:.*]], %{{.*}}:3 = transform.structured.tile_using_for %[[TILED]] + %2, %loops2:3 = transform.structured.tile_using_for %1 [2, 2, 2] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, 
!pdl.operation, !pdl.operation) // CHECK: %[[PADDED:.*]], %{{.*}}, %{{.*}} = transform.structured.pad %[[TILED2]] {pack_paddings = [1, 1, 0]} %3, %pad, %copy_back = transform.structured.pad %2 {pack_paddings = [1, 1, 0]} : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation) diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir index 31ad396b051e..2d4c96b62d22 100644 --- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir +++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir @@ -17,7 +17,7 @@ func.func @matmul_tensors( transform.sequence failures(propagate) { ^bb1(%module_op: !transform.any_op): %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op - %1, %loops:3 = transform.structured.tile %0 [4, 4, 4] + %1, %loops:3 = transform.structured.tile_using_for %0 [4, 4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) %2 = get_parent_op %1 {isolated_from_above} : (!transform.any_op) -> !transform.any_op transform.structured.vectorize_children_and_apply_patterns %2 { vectorize_padding } : (!transform.any_op) -> !transform.any_op diff --git a/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir b/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir index 25a2981d0e41..53fbec94150d 100644 --- a/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir +++ b/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir @@ -1,7 +1,7 @@ transform.sequence failures(propagate) { ^bb1(%arg1: !transform.any_op): %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %foreach_op, %tiled_op = transform.structured.tile_to_forall_op %0 num_threads [13, 33] + %tiled_op, %foreach_op = transform.structured.tile_using_forall %0 num_threads [13, 33] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %dispatch_op = transform.iree.forall_to_flow %foreach_op : (!transform.any_op) -> !transform.any_op } diff --git a/tests/transform_dialect/cpu/attention_codegen_spec.mlir b/tests/transform_dialect/cpu/attention_codegen_spec.mlir index a914ba26dd0e..c73e822d593a 100644 --- a/tests/transform_dialect/cpu/attention_codegen_spec.mlir +++ b/tests/transform_dialect/cpu/attention_codegen_spec.mlir @@ -7,8 +7,8 @@ transform.sequence failures(propagate) { // Tile and distribute to workgroups // ========================================== - %forall_grid, %tiled_attention = - transform.structured.tile_to_forall_op %attention num_threads [1] + %tiled_attention, %forall_grid = + transform.structured.tile_using_forall %attention num_threads [1] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid diff --git a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir index 02b05e9067d0..63fc65fe9f92 100644 --- a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir +++ b/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir @@ -63,7 +63,7 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { // CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : 
index // CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} -> (vector<8xf32>) { // CHECK: arith.addf %{{.*}} : vector<8x16xf32> -// CHECK-COUNT-16: vector.extract %{{.*}} : vector<16x8xf32>{{[[:space:]].*}}arith.addf %{{.*}} : vector<8xf32> +// CHECK-COUNT-16: vector.extract %{{.*}} : vector<8xf32> from vector<16x8xf32>{{[[:space:]].*}}arith.addf %{{.*}} : vector<8xf32> // CHECK: scf.yield %{{.*}} : vector<8xf32> // CHECK: } // CHECK: math.sqrt %{{.*}} : vector<8xf32> diff --git a/tests/transform_dialect/cpu/matmul_codegen_custom_dispatch_formation_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_custom_dispatch_formation_spec.mlir index e50f209c0aed..8bb09df90a23 100644 --- a/tests/transform_dialect/cpu/matmul_codegen_custom_dispatch_formation_spec.mlir +++ b/tests/transform_dialect/cpu/matmul_codegen_custom_dispatch_formation_spec.mlir @@ -4,8 +4,8 @@ transform.sequence failures(propagate) { ^bb1(%variant_op: !transform.any_op): %0 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall, %tiled_generic = - transform.structured.tile_to_forall_op %0 num_threads [2] + %tiled_generic, %forall = + transform.structured.tile_using_forall %0 num_threads [2] // TODO: IREE needs own workgroup mapping attribute. ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir index 1e87461342ae..b84a4bac7260 100644 --- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir +++ b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir @@ -6,8 +6,8 @@ transform.sequence failures(propagate) { // Step 1. Tile to forall with tile_sizes [2]. // =================================================== - %forall, %tiled_generic = - transform.structured.tile_to_forall_op %matmul tile_sizes [2] + %tiled_generic, %forall = + transform.structured.tile_using_forall %matmul tile_sizes [2] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir index 6cba9829d2c6..11a6372b4739 100644 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir @@ -12,8 +12,8 @@ transform.sequence failures(propagate) { // Step 2. 
Tile the matmul and fuse the fill // =========================================================================== - %forall_grid, %grid_reduction = - transform.structured.tile_to_forall_op %matmul1 tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %grid_reduction, %forall_grid = + transform.structured.tile_using_forall %matmul1 tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () transform.structured.fuse_into_containing_op %fill1 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir index 641e14e16746..2f03915b83bc 100644 --- a/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir @@ -15,8 +15,8 @@ transform.sequence failures(propagate) { // Step 2. First level of tiling + fusion parallelizes to blocks. // =========================================================================== - %forall_grid, %grid_combiner_op = - transform.structured.tile_to_forall_op %combiner_op tile_sizes [1] + %grid_combiner_op, %forall_grid = + transform.structured.tile_using_forall %combiner_op tile_sizes [1] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -45,7 +45,7 @@ transform.sequence failures(propagate) { // =========================================================================== %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!transform.any_op) -> !transform.any_op %forall_block_combiner_op, %block_combiner_op = - transform.structured.tile_to_forall_op %combiner_2 tile_sizes [1] + transform.structured.tile_using_forall %combiner_2 tile_sizes [1] ( mapping = [#gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -56,7 +56,7 @@ transform.sequence failures(propagate) { %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]} attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op %forall_block_more_parallel_op, %block_more_parallel_op = - transform.structured.tile_to_forall_op %grid_more_parallel_op tile_sizes [1, 1] + transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir index 8744dabc9e38..0c55aa3d5a99 100644 --- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir @@ -17,8 +17,8 @@ transform.sequence failures(propagate) { // Step 2. 
First level of tiling + fusion parallelizes to blocks. Tile the // trailing elementwise the same way we want to tile the reduction. // =========================================================================== - %grid_loop, %trailing_eltwise_grid_op = - transform.structured.tile_to_forall_op %trailing_eltwise tile_sizes [1] + %trailing_eltwise_grid_op, %grid_loop = + transform.structured.tile_using_forall %trailing_eltwise tile_sizes [1] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -48,8 +48,8 @@ transform.sequence failures(propagate) { // fuse in the leading and trailing elementwise. // =========================================================================== %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_trailing_eltwise_op, %block_trailing_eltwise_op = - transform.structured.tile_to_forall_op %trailing_eltwise_2 tile_sizes [1] + %block_trailing_eltwise_op, %forall_trailing_eltwise_op = + transform.structured.tile_using_forall %trailing_eltwise_2 tile_sizes [1] ( mapping = [#gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %block_combiner_op = transform.structured.match ops{["linalg.generic"]} @@ -62,8 +62,8 @@ transform.sequence failures(propagate) { attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]} attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_block_more_parallel_op, %block_more_parallel_op = - transform.structured.tile_to_forall_op %grid_more_parallel_op tile_sizes [1, 1] + %block_more_parallel_op, %forall_block_more_parallel_op = + transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir index 5518cd4dc8a2..3300cd0cc718 100644 --- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir @@ -11,8 +11,8 @@ transform.sequence failures(propagate) { // Step 2. 
Tile the generic and fuse the fill and matmul // =========================================================================== - %forall_grid, %grid_reduction = - transform.structured.tile_to_forall_op %generic tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %grid_reduction, %forall_grid = + transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir index 1d74325b26c5..479aa34de9e2 100644 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir @@ -12,8 +12,8 @@ transform.sequence failures(propagate) { // Step 2. Tile the matmul and fuse the fill // =========================================================================== - %forall_grid, %grid_reduction = - transform.structured.tile_to_forall_op %broadcast tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %grid_reduction, %forall_grid = + transform.structured.tile_using_forall %broadcast tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () transform.structured.fuse_into_containing_op %reduce into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir index aab96469dbe7..9d76c7367ecc 100644 --- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir @@ -10,8 +10,8 @@ transform.sequence failures(propagate) { // Step 2. 
Tile the matmul and fuse the fill // =========================================================================== - %forall_grid, %grid_reduction = - transform.structured.tile_to_forall_op %matmul tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %grid_reduction, %forall_grid = + transform.structured.tile_using_forall %matmul tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir index 4bb4720acdcc..4fc6a495efd9 100644 --- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir @@ -14,8 +14,8 @@ transform.sequence failures(propagate) { // Step 2. First level of tiling + fusion parallelizes to blocks. // =========================================================================== - %forall_grid, %grid_combiner_op = - transform.structured.tile_to_forall_op %combiner_op tile_sizes [1] + %grid_combiner_op, %forall_grid = + transform.structured.tile_using_forall %combiner_op tile_sizes [1] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () @@ -26,8 +26,8 @@ transform.sequence failures(propagate) { // =========================================================================== %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_block_combiner_op, %block_combiner_op = - transform.structured.tile_to_forall_op %grid_combiner_op tile_sizes [1] + %block_combiner_op, %forall_block_combiner_op = + transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1] ( mapping = [#gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -48,8 +48,8 @@ transform.sequence failures(propagate) { %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]} attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_block_more_parallel_op, %block_more_parallel_op = - transform.structured.tile_to_forall_op %grid_more_parallel_op tile_sizes [1, 1] + %block_more_parallel_op, %forall_block_more_parallel_op = + transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir index 94a3315267b6..4a9548021f8b 100644 --- 
a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir @@ -31,7 +31,7 @@ transform.sequence failures(propagate) { // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the // trailing elementwise the same way we want to tile the reduction. // =========================================================================== - %grid_loop, %eltwise_grid_op = transform.structured.tile_to_forall_op %eltwise + %eltwise_grid_op, %grid_loop = transform.structured.tile_using_forall %eltwise tile_sizes [1] (mapping = [#gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %grid_loop : (!transform.any_op) -> () @@ -53,8 +53,8 @@ transform.sequence failures(propagate) { // =========================================================================== %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!transform.any_op) -> !transform.any_op - %eltwise_block_loop, %eltwise_block_op = - transform.structured.tile_to_forall_op %eltwise_grid_op tile_sizes [1] + %eltwise_block_op, %eltwise_block_loop = + transform.structured.tile_using_forall %eltwise_grid_op tile_sizes [1] ( mapping = [#gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) %block_combiner_op = transform.structured.match ops{["linalg.generic"]} @@ -78,8 +78,8 @@ transform.sequence failures(propagate) { %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]} attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_block_more_parallel_op, %block_more_parallel_op = - transform.structured.tile_to_forall_op %grid_more_parallel_op tile_sizes [1, 1] + %block_more_parallel_op, %forall_block_more_parallel_op = + transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir index 037e17f7ce9f..f7a186d5690e 100644 --- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir @@ -7,8 +7,8 @@ transform.sequence failures(propagate) { // Step 1. First level of tiling + fusion parallelizes to blocks. // =========================================================================== - %forall_grid, %grid_reduction = - transform.structured.tile_to_forall_op %reduction tile_sizes [1] + %grid_reduction, %forall_grid = + transform.structured.tile_using_forall %reduction tile_sizes [1] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () @@ -16,11 +16,11 @@ transform.sequence failures(propagate) { // Step 2. Split the reduction to get meatier parallelism. 
// =========================================================================== - %forall, %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2 = - transform.structured.tile_reduction_using_scf %grid_reduction by tile_sizes = [0, 128] + %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall = + transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) %_1:2 = - transform.structured.tile_to_forall_op %block_more_parallel_op_2 num_threads [0, 32] + transform.structured.tile_using_forall %block_more_parallel_op_2 num_threads [0, 32] ( mapping = [#gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -28,13 +28,13 @@ transform.sequence failures(propagate) { // =========================================================================== // 1st op is [parallel, parallel], map it to threadIdx.x by 4. %_2:2 = - transform.structured.tile_to_forall_op %block_more_parallel_fill_op_2 tile_sizes [0, 4] + transform.structured.tile_using_forall %block_more_parallel_fill_op_2 tile_sizes [0, 4] ( mapping = [#gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`. %_3:2 = - transform.structured.tile_to_forall_op %block_combiner_op_2 tile_sizes [1] + transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1] ( mapping = [#gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir index 3677a82aaf04..4c71f83fb26d 100644 --- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir @@ -16,8 +16,8 @@ transform.sequence failures(propagate) { // Step 1. First level of tiling + fusion parallelizes to blocks. // ============================================================== - %forall, %_ = - transform.structured.tile_to_forall_op %div tile_sizes [1, 4] + %_, %forall = + transform.structured.tile_using_forall %div tile_sizes [1, 4] ( mapping = [#gpu.block, #gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () @@ -60,7 +60,7 @@ transform.sequence failures(propagate) { %tiled_exp_and_exps_sum, %tiled_exp_and_exps_sum_2 : !transform.any_op - transform.structured.tile_to_forall_op %reduction_linalg_ops tile_sizes [1, 1] + transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) // Fully parallel ops are tiled and mapped. 
@@ -68,7 +68,7 @@ transform.sequence failures(propagate) { %tiled_exps_sum_fill, %tiled_div : !transform.any_op - transform.structured.tile_to_forall_op %parallel_linalg_ops num_threads [1, 4, 32] + transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32] ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir index 869b3cbde327..5f8175ccafb1 100644 --- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir @@ -12,8 +12,8 @@ transform.sequence failures(propagate) { %red = transform.structured.match interface{LinalgOp} attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op %not_root = merge_handles %fill, %red : !transform.any_op - %forall, %tiled_generic = - transform.structured.tile_to_forall_op %root tile_sizes [1, 4] + %tiled_generic, %forall = + transform.structured.tile_using_forall %root tile_sizes [1, 4] ( mapping = [#gpu.block, #gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () @@ -26,8 +26,8 @@ transform.sequence failures(propagate) { attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op %parallel_linalg = transform.structured.match ops{["linalg.generic"]} attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_reduction, %tiled_reduction_generic = - transform.structured.tile_to_forall_op %reduction_linalg tile_sizes [1, 1] + %tiled_reduction_generic, %forall_reduction = + transform.structured.tile_using_forall %reduction_linalg tile_sizes [1, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) // TODO: this fusion currently does not happen properly, this is related to the clone @@ -35,7 +35,7 @@ transform.sequence failures(propagate) { // Once fixed we'll be able to fuse. // Fusion will save us one roundtrip to memory. // transform.structured.fuse_into_containing_op %fill_linalg into %forall_reduction - transform.structured.tile_to_forall_op %parallel_linalg num_threads [1, 4, 32] + transform.structured.tile_using_forall %parallel_linalg num_threads [1, 4, 32] ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir index 3603ba665c94..dda89bf4a1bf 100644 --- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir @@ -15,8 +15,8 @@ transform.sequence failures(propagate) { // Step 1. First level of tiling + fusion parallelizes to blocks. 
// ============================================================== - %forall, %_ = - transform.structured.tile_to_forall_op %div tile_sizes [1, 4] + %_, %forall = + transform.structured.tile_using_forall %div tile_sizes [1, 4] ( mapping = [#gpu.block, #gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () @@ -65,7 +65,7 @@ transform.sequence failures(propagate) { %reduction_linalg_ops = transform.merge_handles %tiled_input_max, %tiled_exp_and_exps_sum : !transform.any_op - transform.structured.tile_to_forall_op %reduction_linalg_ops tile_sizes [1, 1] + transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1] ( mapping = [#gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) // Fully parallel ops are tiled and mapped. @@ -73,7 +73,7 @@ transform.sequence failures(propagate) { %tiled_exps_sum_fill, %tiled_div : !transform.any_op - transform.structured.tile_to_forall_op %parallel_linalg_ops num_threads [1, 4, 32] + transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32] ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) diff --git a/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir b/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir index 12d622a16028..2f94296d4df7 100644 --- a/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/vecadd2d_codegen_spec.mlir @@ -3,7 +3,7 @@ transform.sequence failures(propagate) { // Step 1. Find three linalg.generics and tile to GPU thread blocks. // =========================================================================== %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_grid, %_ = transform.structured.tile_to_forall_op %generics + %_, %forall_grid = transform.structured.tile_using_forall %generics tile_sizes [5, 3] ( mapping = [#gpu.block, #gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () diff --git a/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir b/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir index 2184cf6c2c1b..fc373cb3e92a 100644 --- a/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir +++ b/tests/transform_dialect/cuda/vecadd2d_codegen_spec_partial_tile.mlir @@ -3,7 +3,7 @@ transform.sequence failures(propagate) { %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op // Tile only one dimension, skip the other one. - %forall_grid, %_ = transform.structured.tile_to_forall_op %generics + %_, %forall_grid = transform.structured.tile_using_forall %generics tile_sizes [0, 3] ( mapping = [#gpu.block]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () diff --git a/third_party/llvm-project b/third_party/llvm-project index b0e28eb83271..435707e15f88 160000 --- a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit b0e28eb832710964067a17d845de15ada2da2b9c +Subproject commit 435707e15f887304dc0ff6779462262323faeb76
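Illustrative sketch (not part of the patch above): most of the transform-spec churn in these tests is one mechanical rewrite. transform.structured.tile_to_forall_op is now spelled transform.structured.tile_using_forall, and the op returns the tiled payload op first and the forall (loop) handle second, so the two result bindings swap places. The snippet below is a minimal before/after sketch; %op, %tiled, %forall, %tiled2, %forall2 and the #gpu.block<x> mapping are hypothetical placeholders, not taken from any specific test in this patch.

  // Old spelling: loop handle first, tiled op second (hypothetical handles).
  %forall, %tiled = transform.structured.tile_to_forall_op %op tile_sizes [1]
      ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
  // New spelling: tiled op first, loop handle second (hypothetical handles).
  %tiled2, %forall2 = transform.structured.tile_using_forall %op tile_sizes [1]
      ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

The FileCheck updates in the SPIRV and CUDA tests track the assembly-format change that came with the same LLVM bump: vector.extract now prints its result type as well, e.g. vector.extract %v[0] : f32 from vector<4xf32> instead of vector.extract %v[0] : vector<4xf32>, where %v stands for any vector<4xf32> value.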