Skip to content

Commit

Permalink
Revert "[Codegen] Add vector transfer + slice foldings in GenericVectorization (iree-org#17613)" (iree-org#17997)
Browse files Browse the repository at this point in the history

This reverts commit 8b83425.

This change is hurting SVE+SME performance pretty badly. See
iree-org#17613 for context.

Signed-off-by: Andrzej Warzynski <[email protected]>
  • Loading branch information
banach-space authored Jul 24, 2024
1 parent 2af25b5 commit 10877f6
Show file tree
Hide file tree
Showing 11 changed files with 114 additions and 100 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,6 @@ class GenericVectorizationPass
this->generateContract.setValue(options.generateContract);
this->foldCastIntoContract.setValue(options.foldCastIntoContract);
this->maxVectorSize.setValue(options.maxVectorSize);
this->earlySubsetTransferFolding.setValue(
options.earlySubsetTransferFolding);
}

void getDependentDialects(DialectRegistry &registry) const override {
Expand Down Expand Up @@ -386,17 +384,8 @@ void GenericVectorizationPass::runOnOperation() {
};

{
// Canonicalize mask related ops before we lower them. Also run patterns
// for vector transfers on tensor subset ops, since they can be folded if
// not handled here.
// Canonicalize mask related ops before we lower them.
RewritePatternSet maskCanonPatterns(funcOp.getContext());
if (earlySubsetTransferFolding) {
// It is important to add these vector transfer on tensor subset patterns
// in the first greedy pattern rewrite, since transfer foldings can remove
// vectorized reads and writes by folding them into tensor ops.
tensor::populateFoldTensorSubsetIntoVectorTransferPatterns(
maskCanonPatterns);
}
vector::CreateMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
funcOp.getContext());
vector::ConstantMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
Expand Down
2 changes: 0 additions & 2 deletions compiler/src/iree/compiler/Codegen/Common/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,6 @@ struct GenericVectorizationPassOptions {
bool foldCastIntoContract = false;
// Max vector size allowed to avoid creating large vectors.
int64_t maxVectorSize = std::numeric_limits<int64_t>::max();
// Enable early folding of tensor subset ops into vector transfer ops.
bool earlySubsetTransferFolding = true;
};
/// Creates a pass to perform vectorization on LinAlg and tensor ops.
std::unique_ptr<InterfacePass<FunctionOpInterface>>
Expand Down
4 changes: 1 addition & 3 deletions compiler/src/iree/compiler/Codegen/Common/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,7 @@ def GenericVectorization :
"Enable folding casting ops into vector.contract.">,
Option<"maxVectorSize", "max-vector-size", "int64_t",
/*default=*/"2147483647",
"Max vector size allowed to avoid creating large vectors.">,
Option<"earlySubsetTransferFolding", "early-subset-transfer-folding", "bool",/*default=*/"true",
"Enable early folding of tensor subset ops into vector transfer ops.">
"Max vector size allowed to avoid creating large vectors.">
];
let constructor =
"mlir::iree_compiler::createGenericVectorizationPass()";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1
// CHECK-LABEL: func.func @single_static_pack_infer_vector_size
// CHECK: tensor.pack

// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)>
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)>
// CHECK-MASK-LABEL: func.func @single_static_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8
Expand All @@ -79,8 +79,9 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1
// CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]]
// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8>
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1]
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor<?x?x16x2xi8>
Expand Down Expand Up @@ -129,12 +130,12 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor<?x?xi8>, %arg1: t
// CHECK-LABEL: func.func @single_dynamic_pack_infer_vector_size
// CHECK: tensor.pack

// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK-LABEL: func.func @single_dynamic_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8
Expand All @@ -144,8 +145,9 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor<?x?xi8>, %arg1: t
// CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]]
// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]]
// CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8>
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1]
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor<?x?x16x2xi8>
Expand Down Expand Up @@ -202,13 +204,13 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor<?x32x128xf32>) -> tensor
}
return %3 : tensor<32x?x64x16x2xbf16>
}
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)>
// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK-DAG: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)>
// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)>
// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-MASK: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-MASK-LABEL: func.func @generic_pack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0_BF16:.+]] = arith.constant 0.000000e+00 : bf16
Expand All @@ -227,8 +229,9 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor<?x32x128xf32>) -> tensor
// CHECK-MASK-DAG: %[[SRC_SZ0:.+]] = affine.min #[[$MAP4]]
// CHECK-MASK-DAG: %[[SRC_SZ2:.+]] = affine.min #[[$MAP3]]
// CHECK-MASK-DAG: %[[ITER_SLICE:.+]] = tensor.extract_slice %[[GENERIC_EMPTY]]
// CHECK-MASK-DAG: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[SRC_SZ0]], 2, %[[SRC_SZ2]]]
// CHECK-MASK-DAG: %[[READ_MASK:.+]] = vector.create_mask %[[SRC_SZ0]], %[[C2]], %[[SRC_SZ2]] : vector<64x2x12xi1>
// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC]]{{.+}} %[[READ_MASK]]
// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}} %[[READ_MASK]]
// CHECK-MASK-DAG: %[[WRITE_MASK:.+]] = vector.create_mask %[[C2]], %[[SRC_SZ2]], %[[SRC_SZ0]] : vector<2x12x64xi1>
// CHECK-MASK: %[[TRUNC:.+]] = arith.truncf %[[GENERIC_READ]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[TRUNC]], [1, 2, 0]
Expand Down Expand Up @@ -275,10 +278,10 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>,
}
return %0 : tensor<?x?xf32>
}
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-LABEL: func.func @single_dynamic_unpack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index
Expand All @@ -289,8 +292,9 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>,
// CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]]
// CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]]
// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3]
// CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32>
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor<?x?xf32>
Expand Down Expand Up @@ -334,10 +338,10 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>, %arg1:
}
return %0 : tensor<?x?xf32>
}
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)>
// CHECK-MASK-LABEL: func.func @generic_unpack_infer_vector_size
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index
Expand All @@ -348,8 +352,9 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor<?x?x16x16xf32>, %arg1:
// CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]]
// CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]]
// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]]
// CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]]
// CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3]
// CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32>
// CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor<?x?xf32>
Expand Down Expand Up @@ -399,46 +404,4 @@ func.func @dynamic_fill_with_scalable_tiling_infer_vector_size(%arg0: tensor<1x6
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x67x120x176xf32>

// -----

#map = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
func.func @tiled_linalg_copy(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg1, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%0 = scf.for %arg3 = %c0 to %dim step %c16 iter_args(%arg4 = %arg1) -> (tensor<?x?xf32>) {
%1 = scf.for %arg5 = %c0 to %dim_0 step %c32 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
%2 = affine.min #map(%arg3)[%dim]
%3 = affine.min #map1(%arg5)[%dim_0]
%extracted_slice_0 = tensor.extract_slice %arg0[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg1[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%copy = linalg.copy ins(%extracted_slice_0 : tensor<?x?xf32>) outs(%extracted_slice_1 : tensor<?x?xf32>) -> tensor<?x?xf32>
%inserted_slice = tensor.insert_slice %copy into %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
scf.yield %1 : tensor<?x?xf32>
}
return %0 : tensor<?x?xf32>
}
// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)>
// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
// CHECK-MASK-LABEL: func.func @tiled_linalg_copy
// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]: tensor<?x?xf32>, %[[DST:[a-zA-Z0-9]+]]
// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-MASK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-MASK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-MASK-DAG: %[[C32:.+]] = arith.constant 32 : index
// CHECK-MASK: scf.for %[[IV0:.+]] = %[[C0]]
// CHECK-MASK: scf.for %[[IV1:.+]] = %[[C0]] {{.*}} iter_args(%[[ITER_ARG:.+]] = {{.*}})
// CHECK-MASK-DAG: %[[DST_SZ0:.+]] = affine.min #[[$MAP0]]
// CHECK-MASK-DAG: %[[DST_SZ1:.+]] = affine.min #[[$MAP1]]
// CHECK-MASK: %[[DST_SLICE:.+]] = tensor.extract_slice %[[DST]][%[[IV0]], %[[IV1]]] [%[[DST_SZ0]], %[[DST_SZ1]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
// CHECK-MASK: %[[MASK:.+]] = vector.create_mask %[[DST_SZ0]], %[[DST_SZ1]] : vector<16x32xi1>
// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[IV0]], %[[IV1]]],{{.*}} %[[MASK]]{{.*}} : tensor<?x?xf32>, vector<16x32xf32>
// CHECK-MASK: vector.transfer_write %[[READ]], %[[DST_SLICE]]{{.+}}, %[[MASK]]
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32>
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,13 @@ module {
// CHECK: scf.for
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SUBVIEW]]
// CHECK: %[[RESULT_VEC:.+]] = scf.if %{{.+}} -> (vector<4xf32>) {
// CHECK: %[[VEC_LOAD:.+]] = vector.load %[[INPUT_SUBVIEW]]
// CHECK: scf.yield %[[VEC_LOAD]]
// CHECK: }
// CHECK: vector.store %[[RESULT_VEC]], %[[OUTPUT_SUBVIEW]]
// CHECK: %[[DROP_UNIT_OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SLICE]]
// CHECK: vector.store %[[RESULT_VEC]], %[[DROP_UNIT_OUTPUT_SLICE]]

// -----
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
// CHECK: scf.for {{.*}} iter_args(%[[OUT_TENSOR:.*]] = {{.*}}) -> (tensor<1024x1024xf32>) {
// CHECK-NEXT: scf.for {{.*}} iter_args(%[[OUT_TENSOR_1:.*]] = %[[OUT_TENSOR]]) -> (tensor<1024x1024xf32>) {
// CHECK-NEXT: %[[OUT_SLICE:.*]] = tensor.extract_slice %[[OUT_TENSOR_1]]{{.*}} : tensor<1024x1024xf32> to tensor<8x?xf32>
// CHECK-NEXT: %[[OUT_SLICE_1:.*]] = tensor.extract_slice %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> to tensor<8x?xf32>
// CHECK-NEXT: %[[OUT_VEC:.*]] = vector.transfer_read %[[OUT_TENSOR_1]]{{.*}} : tensor<1024x1024xf32>, vector<8x[16]xf32>
// CHECK-NEXT: %[[INNER_LOOP:.*]] = scf.for {{.*}} iter_args(%[[RES:.*]] = %[[OUT_VEC]]) -> (vector<8x[16]xf32>) {
// CHECK-NEXT: %[[LHS:.*]] = vector.transfer_read {{.*}} : tensor<1024x1024xf32>, vector<8x1xf32>
Expand All @@ -29,8 +30,9 @@
// CHECK-SAME: %[[LHS]], %[[RHS]], %[[RES]] : vector<8x1xf32>, vector<1x[16]xf32> into vector<8x[16]xf32>
// CHECK-NEXT: scf.yield %[[CONTRACT]] : vector<8x[16]xf32>
// CHECK-NEXT: }
// CHECK-NEXT: %[[OUT_WRITE:.*]] = vector.transfer_write %[[INNER_LOOP]], %[[OUT_SLICE]]{{.*}} {{.*}} : vector<8x[16]xf32>, tensor<8x?xf32>
// CHECK-NEXT: %[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32>
// CHECK-NEXT: %[[OUT_WRITE:.*]] = vector.transfer_write %[[INNER_LOOP]], %[[OUT_SLICE_1]]{{.*}} {{.*}} : vector<8x[16]xf32>, tensor<8x?xf32>
// CHECK-NEXT: %[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> into tensor<8x?xf32>
// CHECK-NEXT: tensor.insert_slice %[[INSERT_SLICE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32>

func.func @pipeline() {
%c1 = arith.constant 1 : index
Expand Down
Loading

0 comments on commit 10877f6

Please sign in to comment.