From 10877f61c2e73d578bb889012b84bbd1edf15ed0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Wed, 24 Jul 2024 19:47:48 +0100
Subject: [PATCH] Revert "[Codegen] Add vector transfer + slice foldings in
 GenericVectorization (#17613)" (#17997)

This reverts commit 8b8342596bb399f0c699e980d87b17875905d66e.

This change is hurting SVE+SME performance pretty badly. See
https://github.com/iree-org/iree/pull/17613 for context.

Signed-off-by: Andrzej Warzynski
---
 .../Codegen/Common/GenericVectorization.cpp   |  13 +-
 .../src/iree/compiler/Codegen/Common/Passes.h |   2 -
 .../iree/compiler/Codegen/Common/Passes.td    |   4 +-
 .../Common/test/generic_vectorization.mlir    | 113 ++++++------------
 .../LLVMCPU/test/pipeline_pad_tests.mlir      |   4 +-
 .../vectorize_with_masking_and_hoist.mlir     |   6 +-
 .../iree/compiler/Codegen/LLVMGPU/Passes.cpp  |   7 +-
 .../compiler/Codegen/LLVMGPU/test/BUILD.bazel |   1 +
 .../Codegen/LLVMGPU/test/CMakeLists.txt       |   1 +
 .../test/ROCDL/pipeline_tile_and_fuse.mlir    |   2 +
 .../LLVMGPU/test/conv_pipeline_test_rocm.mlir |  61 ++++++++++
 11 files changed, 114 insertions(+), 100 deletions(-)
 create mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir

diff --git a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
index 0a2bea0a9910..e36a9c789092 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
@@ -322,8 +322,6 @@ class GenericVectorizationPass
     this->generateContract.setValue(options.generateContract);
     this->foldCastIntoContract.setValue(options.foldCastIntoContract);
     this->maxVectorSize.setValue(options.maxVectorSize);
-    this->earlySubsetTransferFolding.setValue(
-        options.earlySubsetTransferFolding);
   }
 
   void getDependentDialects(DialectRegistry &registry) const override {
@@ -386,17 +384,8 @@ void GenericVectorizationPass::runOnOperation() {
   };
 
   {
-    // Canonicalize mask related ops before we lower them. Also run patterns
-    // for vector transfers on tensor subset ops, since they can be folded if
-    // not handled here.
+    // Canonicalize mask related ops before we lower them.
     RewritePatternSet maskCanonPatterns(funcOp.getContext());
-    if (earlySubsetTransferFolding) {
-      // It is important to add these vector transfer on tensor subset patterns
-      // in the first greedy pattern rewrite, since transfer foldings can remove
-      // vectorized reads and writes by folding them into tensor ops.
-      tensor::populateFoldTensorSubsetIntoVectorTransferPatterns(
-          maskCanonPatterns);
-    }
     vector::CreateMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
                                                       funcOp.getContext());
     vector::ConstantMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h
index 621fb35d2e62..2880477d0a2b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -174,8 +174,6 @@ struct GenericVectorizationPassOptions {
   bool foldCastIntoContract = false;
   // Max vector size allowed to avoid creating large vectors.
   int64_t maxVectorSize = std::numeric_limits<int64_t>::max();
-  // Enable early folding of tensor subset ops into vector transfer ops.
-  bool earlySubsetTransferFolding = true;
 };
 /// Creates a pass to perform vectorization on LinAlg and tensor ops.
std::unique_ptr> diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td index 0f313c55a8d5..ed182941c372 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td @@ -288,9 +288,7 @@ def GenericVectorization : "Enable folding casting ops into vector.contract.">, Option<"maxVectorSize", "max-vector-size", "int64_t", /*default=*/"2147483647", - "Max vector size allowed to avoid creating large vectors.">, - Option<"earlySubsetTransferFolding", "early-subset-transfer-folding", "bool",/*default=*/"true", - "Enable early folding of tensor subset ops into vector transfer ops."> + "Max vector size allowed to avoid creating large vectors."> ]; let constructor = "mlir::iree_compiler::createGenericVectorizationPass()"; diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir index 924fe5b5f950..3f0947d43f91 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir @@ -64,12 +64,12 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1 // CHECK-LABEL: func.func @single_static_pack_infer_vector_size // CHECK: tensor.pack -// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)> -// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)> -// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)> -// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)> -// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)> +// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)> +// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)> +// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)> +// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)> // CHECK-MASK-LABEL: func.func @single_static_pack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8 @@ -79,8 +79,9 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1 // CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]] // CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]] // CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]] +// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]] // CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]] +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]] // CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8> // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1] // CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor @@ -129,12 +130,12 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor, %arg1: t // CHECK-LABEL: func.func @single_dynamic_pack_infer_vector_size // CHECK: tensor.pack -// CHECK-MASK-DAG: #[[$MAP0:.+]] = 
affine_map<(d0)[s0] -> (-d0 + s0, 2)> -// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)> -// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)> -// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)> +// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> +// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> +// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)> +// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)> // CHECK-MASK-LABEL: func.func @single_dynamic_pack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8 @@ -144,8 +145,9 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor, %arg1: t // CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]] // CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]] // CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]] +// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]] // CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]] +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]] // CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8> // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1] // CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor @@ -202,13 +204,13 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor) -> tensor } return %3 : tensor<32x?x64x16x2xbf16> } -// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> -// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)> -// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)> -// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)> -// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)> -// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)> -// CHECK-MASK-DAG: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-MASK: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> +// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)> +// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)> +// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)> +// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)> +// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-MASK: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)> // CHECK-MASK-LABEL: func.func @generic_pack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK-DAG: %[[C0_BF16:.+]] = arith.constant 0.000000e+00 : bf16 @@ -227,8 +229,9 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor) -> tensor // CHECK-MASK-DAG: %[[SRC_SZ0:.+]] = affine.min #[[$MAP4]] // CHECK-MASK-DAG: %[[SRC_SZ2:.+]] = affine.min #[[$MAP3]] // CHECK-MASK-DAG: %[[ITER_SLICE:.+]] = tensor.extract_slice 
%[[GENERIC_EMPTY]] +// CHECK-MASK-DAG: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[SRC_SZ0]], 2, %[[SRC_SZ2]]] // CHECK-MASK-DAG: %[[READ_MASK:.+]] = vector.create_mask %[[SRC_SZ0]], %[[C2]], %[[SRC_SZ2]] : vector<64x2x12xi1> -// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC]]{{.+}} %[[READ_MASK]] +// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}} %[[READ_MASK]] // CHECK-MASK-DAG: %[[WRITE_MASK:.+]] = vector.create_mask %[[C2]], %[[SRC_SZ2]], %[[SRC_SZ0]] : vector<2x12x64xi1> // CHECK-MASK: %[[TRUNC:.+]] = arith.truncf %[[GENERIC_READ]] // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[TRUNC]], [1, 2, 0] @@ -275,10 +278,10 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor, } return %0 : tensor } -// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> -// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> -// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)> -// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)> +// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> +// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> +// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)> // CHECK-MASK-LABEL: func.func @single_dynamic_unpack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index @@ -289,8 +292,9 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor, // CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]] // CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]] // CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]] +// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]] // CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]] +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]] // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3] // CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32> // CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor @@ -334,10 +338,10 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor, %arg1: } return %0 : tensor } -// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)> -// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)> -// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)> -// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)> +// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)> +// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)> +// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)> // CHECK-MASK-LABEL: func.func @generic_unpack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index @@ -348,8 +352,9 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor, %arg1: // CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]] // CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]] // CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]] +// 
CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]] // CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]] +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]] // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3] // CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32> // CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor @@ -399,46 +404,4 @@ func.func @dynamic_fill_with_scalable_tiling_infer_vector_size(%arg0: tensor<1x6 // CHECK-MASK: scf.for // CHECK-MASK: scf.for // CHECK-MASK: scf.for -// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x67x120x176xf32> - -// ----- - -#map = affine_map<(d0)[s0] -> (-d0 + s0, 16)> -#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 32)> -func.func @tiled_linalg_copy(%arg0: tensor, %arg1: tensor) -> tensor { - %c32 = arith.constant 32 : index - %c16 = arith.constant 16 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %dim = tensor.dim %arg1, %c0 : tensor - %dim_0 = tensor.dim %arg1, %c1 : tensor - %0 = scf.for %arg3 = %c0 to %dim step %c16 iter_args(%arg4 = %arg1) -> (tensor) { - %1 = scf.for %arg5 = %c0 to %dim_0 step %c32 iter_args(%arg6 = %arg4) -> (tensor) { - %2 = affine.min #map(%arg3)[%dim] - %3 = affine.min #map1(%arg5)[%dim_0] - %extracted_slice_0 = tensor.extract_slice %arg0[%arg3, %arg5] [%2, %3] [1, 1] : tensor to tensor - %extracted_slice_1 = tensor.extract_slice %arg1[%arg3, %arg5] [%2, %3] [1, 1] : tensor to tensor - %copy = linalg.copy ins(%extracted_slice_0 : tensor) outs(%extracted_slice_1 : tensor) -> tensor - %inserted_slice = tensor.insert_slice %copy into %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor into tensor - scf.yield %inserted_slice : tensor - } - scf.yield %1 : tensor - } - return %0 : tensor -} -// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)> -// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)> -// CHECK-MASK-LABEL: func.func @tiled_linalg_copy -// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]: tensor, %[[DST:[a-zA-Z0-9]+]] -// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-MASK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-MASK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-MASK-DAG: %[[C32:.+]] = arith.constant 32 : index -// CHECK-MASK: scf.for %[[IV0:.+]] = %[[C0]] -// CHECK-MASK: scf.for %[[IV1:.+]] = %[[C0]] {{.*}} iter_args(%[[ITER_ARG:.+]] = {{.*}}) -// CHECK-MASK-DAG: %[[DST_SZ0:.+]] = affine.min #[[$MAP0]] -// CHECK-MASK-DAG: %[[DST_SZ1:.+]] = affine.min #[[$MAP1]] -// CHECK-MASK: %[[DST_SLICE:.+]] = tensor.extract_slice %[[DST]][%[[IV0]], %[[IV1]]] [%[[DST_SZ0]], %[[DST_SZ1]]] [1, 1] : tensor to tensor -// CHECK-MASK: %[[MASK:.+]] = vector.create_mask %[[DST_SZ0]], %[[DST_SZ1]] : vector<16x32xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[IV0]], %[[IV1]]],{{.*}} %[[MASK]]{{.*}} : tensor, vector<16x32xf32> -// CHECK-MASK: vector.transfer_write %[[READ]], %[[DST_SLICE]]{{.+}}, %[[MASK]] +// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir 
b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir index 27898749a907..045193a29cea 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir @@ -33,11 +33,13 @@ module { // CHECK: scf.for // CHECK: scf.for // CHECK: scf.for +// CHECK: %[[OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SUBVIEW]] // CHECK: %[[RESULT_VEC:.+]] = scf.if %{{.+}} -> (vector<4xf32>) { // CHECK: %[[VEC_LOAD:.+]] = vector.load %[[INPUT_SUBVIEW]] // CHECK: scf.yield %[[VEC_LOAD]] // CHECK: } -// CHECK: vector.store %[[RESULT_VEC]], %[[OUTPUT_SUBVIEW]] +// CHECK: %[[DROP_UNIT_OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SLICE]] +// CHECK: vector.store %[[RESULT_VEC]], %[[DROP_UNIT_OUTPUT_SLICE]] // ----- #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir index 5f854dc25d60..b6450d346fcc 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir @@ -20,6 +20,7 @@ // CHECK: scf.for {{.*}} iter_args(%[[OUT_TENSOR:.*]] = {{.*}}) -> (tensor<1024x1024xf32>) { // CHECK-NEXT: scf.for {{.*}} iter_args(%[[OUT_TENSOR_1:.*]] = %[[OUT_TENSOR]]) -> (tensor<1024x1024xf32>) { // CHECK-NEXT: %[[OUT_SLICE:.*]] = tensor.extract_slice %[[OUT_TENSOR_1]]{{.*}} : tensor<1024x1024xf32> to tensor<8x?xf32> +// CHECK-NEXT: %[[OUT_SLICE_1:.*]] = tensor.extract_slice %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> to tensor<8x?xf32> // CHECK-NEXT: %[[OUT_VEC:.*]] = vector.transfer_read %[[OUT_TENSOR_1]]{{.*}} : tensor<1024x1024xf32>, vector<8x[16]xf32> // CHECK-NEXT: %[[INNER_LOOP:.*]] = scf.for {{.*}} iter_args(%[[RES:.*]] = %[[OUT_VEC]]) -> (vector<8x[16]xf32>) { // CHECK-NEXT: %[[LHS:.*]] = vector.transfer_read {{.*}} : tensor<1024x1024xf32>, vector<8x1xf32> @@ -29,8 +30,9 @@ // CHECK-SAME: %[[LHS]], %[[RHS]], %[[RES]] : vector<8x1xf32>, vector<1x[16]xf32> into vector<8x[16]xf32> // CHECK-NEXT: scf.yield %[[CONTRACT]] : vector<8x[16]xf32> // CHECK-NEXT: } -// CHECK-NEXT: %[[OUT_WRITE:.*]] = vector.transfer_write %[[INNER_LOOP]], %[[OUT_SLICE]]{{.*}} {{.*}} : vector<8x[16]xf32>, tensor<8x?xf32> -// CHECK-NEXT: %[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32> +// CHECK-NEXT: %[[OUT_WRITE:.*]] = vector.transfer_write %[[INNER_LOOP]], %[[OUT_SLICE_1]]{{.*}} {{.*}} : vector<8x[16]xf32>, tensor<8x?xf32> +// CHECK-NEXT: %[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> into tensor<8x?xf32> +// CHECK-NEXT: tensor.insert_slice %[[INSERT_SLICE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32> func.func @pipeline() { %c1 = arith.constant 1 : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 516d7dcc9fe1..3b8b847045ec 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ 
-239,8 +239,7 @@ static void tileAndBufferize(OpPassManager &funcPassManager) { addBufferizePasses(funcPassManager); } -static void addGPUVectorizationPasses(OpPassManager &funcPassManager, - bool earlySubsetTransferFolding = true) { +static void addGPUVectorizationPasses(OpPassManager &funcPassManager) { funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass()); // Vectorize. GenericVectorizationPassOptions options; @@ -248,7 +247,6 @@ static void addGPUVectorizationPasses(OpPassManager &funcPassManager, options.vectorizeGatherAccesses = true; options.enableCleanup = false; options.foldCastIntoContract = true; - options.earlySubsetTransferFolding = earlySubsetTransferFolding; funcPassManager.addPass(createGenericVectorizationPass(options)); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); @@ -775,8 +773,7 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); // Linalg -> Vector - addGPUVectorizationPasses(funcPassManager, - /*earlySubsetTransferFolding=*/false); + addGPUVectorizationPasses(funcPassManager); // Allocate tensors for copies to shared memory. funcPassManager.addPass(createGPUVectorAllocPass()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index e75bfc66c986..1757b5ce48f4 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -24,6 +24,7 @@ iree_lit_test_suite( "attention.mlir", "attention_mfma.mlir", "conv_pipeline_test_cuda.mlir", + "conv_pipeline_test_rocm.mlir", "convert_to_nvvm.mlir", "convert_to_rocdl.mlir", "create_async_groups.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 6d87f02af258..2ff84aa75ea2 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -24,6 +24,7 @@ iree_lit_test_suite( "config_matvec.mlir" "config_winograd.mlir" "conv_pipeline_test_cuda.mlir" + "conv_pipeline_test_rocm.mlir" "convert_to_nvvm.mlir" "convert_to_rocdl.mlir" "create_async_groups.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index d5ad390fa821..95463a872aa2 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -48,6 +48,7 @@ hal.executable public @main { // CHECK-DAG: memref.alloc() : memref<64x4xf16, #gpu.address_space> // CHECK-DAG: memref.alloc() : memref<64x4xf16, #gpu.address_space> // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1280 step %c4 {{.*}} -> (vector<8x4xf32>) +// CHECK: gpu.barrier // CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2xf16> // CHECK: vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC:[A-Za-z0-9]+]] // CHECK: gpu.barrier @@ -108,6 +109,7 @@ hal.executable public @main { // CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> // CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>) +// CHECK: gpu.barrier // CHECK: %[[LHS_RD:.+]] = 
vector.transfer_read %[[B0]]{{.*}} vector<8xf16> // CHECK: vector.transfer_write %[[LHS_RD]] // CHECK: gpu.barrier diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir new file mode 100644 index 000000000000..fbc4faa1b2b6 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir @@ -0,0 +1,61 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \ +// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \ +// RUN: %s | FileCheck %s + +#layout = #hal.pipeline.layout, + <1, storage_buffer, ReadOnly>, + <2, storage_buffer, ReadOnly>, + <3, storage_buffer> + ]> + ]> +hal.executable private @conv_nchw_dispatch_1 { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16 ordinal(0) layout(#layout) attributes { + hal.interface.bindings = [ + #hal.interface.binding<0, 0>, + #hal.interface.binding<0, 1>, + #hal.interface.binding<0, 2>, + #hal.interface.binding<0, 3> + ], + translation_info = #iree_codegen.translation_info} { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16() { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 320, 130, 130], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x320x130x130xf16> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<320x320x3x3xf16> + %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [320], strides = [1] : !flow.dispatch.tensor> -> tensor<320xf16> + %7 = tensor.empty() : tensor<2x320x64x64xf16> + %8 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%7 : tensor<2x320x64x64xf16>) -> tensor<2x320x64x64xf16> + %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<2> : vector<2xi64>} ins(%4, %5 : tensor<2x320x130x130xf16>, tensor<320x320x3x3xf16>) outs(%8 : tensor<2x320x64x64xf16>) -> tensor<2x320x64x64xf16> + %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<2x320x64x64xf16>, tensor<320xf16>) outs(%7 : tensor<2x320x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { + ^bb0(%in: f16, %in_0: f16, %out: 
f16):
+        %11 = arith.addf %in, %in_0 : f16
+        linalg.yield %11 : f16
+      } -> tensor<2x320x64x64xf16>
+      flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0, 0], sizes = [2, 320, 64, 64], strides = [1, 1, 1, 1] : tensor<2x320x64x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x320x64x64xf16>>
+      return
+    }
+  }
+  }
+}
+
+// TODO: This test reflects a bug related to how the convolution is bufferized
+// for the LLVMGPUVectorize pipeline, meaning these local memory allocations are
+// not desired. This test should be dropped once the extra buffers have been
+// eliminated.
+
+// CHECK-LABEL: func @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16
+// CHECK-COUNT-3: memref.alloc() : memref<1x1x1x4xf16, #gpu.address_space<private>>
+// CHECK-COUNT-3: memref.copy %{{.*}}, %{{.*}} : memref<1x1x1x4xf16, #gpu.address_space<private>> to memref<{{.*}} #hal.descriptor_type<storage_buffer>>
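
Note (editorial illustration, not part of the patch): the reverted
tensor::populateFoldTensorSubsetIntoVectorTransferPatterns step folded tensor
subset ops into the masked vector transfers created by vectorization, which is
why the restored CHECK-MASK lines above once again expect a tensor.extract_slice
feeding each vector.transfer_read. The sketch below shows the shape of that
rewrite on a minimal, hypothetical example; the function name, value names, and
the ?x?xf32 / 8x32 shapes are illustrative assumptions, not taken from the patch.

// Before folding: the masked vectorized read goes through an explicit slice
// of the tiled source tensor (this is the form the revert restores).
func.func @subset_transfer_folding_sketch(%src: tensor<?x?xf32>,
    %i: index, %j: index, %sz0: index, %sz1: index) -> vector<8x32xf32> {
  %c0 = arith.constant 0 : index
  %pad = arith.constant 0.0 : f32
  %slice = tensor.extract_slice %src[%i, %j] [%sz0, %sz1] [1, 1]
      : tensor<?x?xf32> to tensor<?x?xf32>
  %mask = vector.create_mask %sz0, %sz1 : vector<8x32xi1>
  %read = vector.transfer_read %slice[%c0, %c0], %pad, %mask
      : tensor<?x?xf32>, vector<8x32xf32>
  return %read : vector<8x32xf32>
}

// After the (now reverted) folding, the slice is gone and the read indexes the
// original tensor directly, composing the slice offsets into the transfer:
//
//   %read = vector.transfer_read %src[%i, %j], %pad, %mask
//       : tensor<?x?xf32>, vector<8x32xf32>

With the folding removed again, the extract_slice stays in the IR and the masked
transfer operates on the slice, which the commit message above reports as the
better-performing form for the SVE+SME pipelines.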