[Codegen][CPU] Eliminate all-true vector masks after vectorization (iree-org#18190)

This enables an upstream transform that eliminates all-true
`vector.create_mask` ops. This is particularly beneficial for scalable
vectors, which use dynamic tensor types and therefore produce masks that
would otherwise not fold away until much later, blocking some
optimizations.
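
As a minimal illustrative sketch (not taken from this patch), the scalable
case looks like:

  %c4 = arith.constant 4 : index
  %vscale = vector.vscale
  %c4_vscale = arith.muli %vscale, %c4 : index
  // The mask covers 4 * vscale elements, i.e. every lane of vector<[4]xi1>,
  // so it is all-true; but because %c4_vscale is a dynamic value, the
  // ordinary create_mask folds do not catch this until much later.
  %mask = vector.create_mask %c4_vscale : vector<[4]xi1>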

Depends on llvm/llvm-project#99314.

---------

Signed-off-by: Benjamin Maxwell <[email protected]>
MacDue authored Aug 14, 2024
1 parent c71fe1a commit fe638b0
Showing 2 changed files with 77 additions and 0 deletions.
19 changes: 19 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
@@ -325,6 +325,14 @@ class GenericVectorizationPass final
void runOnOperation() override;
};

/// Converts from iree_compiler::VscaleRange to vector::VscaleRange.
static std::optional<vector::VscaleRange>
toVectorVscaleRange(std::optional<iree_compiler::VscaleRange> vscaleRange) {
if (!vscaleRange.has_value())
return std::nullopt;
return vector::VscaleRange{vscaleRange->min, vscaleRange->max};
}

void GenericVectorizationPass::runOnOperation() {
MLIRContext *context = &getContext();
auto funcOp = getOperation();
@@ -377,6 +385,17 @@ void GenericVectorizationPass::runOnOperation() {
vectorizeGatherAccesses);
};

{
// Eliminate (all-true) vector masks as early as possible (to avoid missing
// optimizations/folds). This is particularly beneficial for scalable
// vectors that use dynamic tensor shapes.
auto targetAttr =
iree_compiler::IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
auto vscaleRange = iree_compiler::getDefaultVscaleRange(targetAttr);
vector::eliminateVectorMasks(rewriter, funcOp,
toVectorVscaleRange(vscaleRange));
}

{
// Canonicalize mask related ops before we lower them.
RewritePatternSet maskCanonPatterns(funcOp.getContext());
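
The `toVectorVscaleRange` helper above only adapts IREE's `VscaleRange`
(looked up from the executable target via `getDefaultVscaleRange`) into the
upstream `vector::VscaleRange` that `vector::eliminateVectorMasks` takes. A
hedged sketch of why the bounds matter, with illustrative values not taken
from this change: if vscale is known to lie in, say, [1, 16], then
vector<[4]xi1> has at most 64 lanes, so a mask operand that is provably at
least that large makes the mask all-true even when it is not literally
4 * vscale.

  %c128 = arith.constant 128 : index
  // Assuming 1 <= vscale <= 16 (illustrative bounds), vector<[4]xi1> has at
  // most 64 lanes, so %c128 covers them all and the mask can be dropped.
  %mask = vector.create_mask %c128 : vector<[4]xi1>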
@@ -445,3 +445,61 @@ func.func @dynamic_fill_with_scalable_tiling_infer_remainder_vector_size(%arg0:
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32>

// -----

#aarch64_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", target_triple = "aarch64-none-elf"}>
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 0, 0], [1, 4, [4], 0], [0, 0, 0, 3], [0, 0, 0, 0]]>
#map = affine_map<()[s0] -> (-(96 mod s0) + 96)>
#map1 = affine_map<(d0) -> (d0 * 2)>

func.func @depthwise_conv_fold_away_masking(%arg0: tensor<1x68x120x96xf32>, %arg1: tensor<1x137x241x96xf32>, %arg2: tensor<3x3x96xf32>) -> tensor<1x68x120x96xf32>
attributes {hal.executable.target = #aarch64_sve}
{
%c3 = arith.constant 3 : index
%c120 = arith.constant 120 : index
%c68 = arith.constant 68 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%vscale = vector.vscale
%c4_vscale = arith.muli %vscale, %c4 : index
%0 = scf.for %arg3 = %c0 to %c68 step %c1 iter_args(%arg4 = %arg0) -> (tensor<1x68x120x96xf32>) {
%1 = scf.for %arg5 = %c0 to %c120 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x68x120x96xf32>) {
%2 = affine.apply #map()[%c4_vscale]
%3 = scf.for %arg7 = %c0 to %2 step %c4_vscale iter_args(%arg8 = %arg6) -> (tensor<1x68x120x96xf32>) {
%4 = affine.apply #map1(%arg3)
%5 = affine.apply #map1(%arg5)
%extracted_slice = tensor.extract_slice %arg1[0, %4, %5, %arg7] [1, 3, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x137x241x96xf32> to tensor<1x3x9x?xf32>
%extracted_slice_0 = tensor.extract_slice %arg2[0, 0, %arg7] [3, 3, %c4_vscale] [1, 1, 1] : tensor<3x3x96xf32> to tensor<3x3x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg8[0, %arg3, %arg5, %arg7] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x68x120x96xf32> to tensor<1x1x4x?xf32>
%6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<1x1x4x?xf32>) -> tensor<1x1x4x?xf32>
%7 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %6) -> (tensor<1x1x4x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg9, 0, 0] [1, 1, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x3x9x?xf32> to tensor<1x1x9x?xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg9, 0, 0] [1, 3, %c4_vscale] [1, 1, 1] : tensor<3x3x?xf32> to tensor<1x3x?xf32>
%extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> to tensor<1x1x4x?xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 1, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x9x?xf32> to tensor<1x9x?xf32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, 0, 0] [1, 3, %c4_vscale] [1, 1, 1] : tensor<1x3x?xf32> to tensor<3x?xf32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> to tensor<1x4x?xf32>
%8 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, lowering_config = #config, strides = dense<2> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x9x?xf32>, tensor<3x?xf32>) outs(%extracted_slice_7 : tensor<1x4x?xf32>) -> tensor<1x4x?xf32>
%inserted_slice_8 = tensor.insert_slice %8 into %extracted_slice_4[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x4x?xf32> into tensor<1x1x4x?xf32>
%inserted_slice_9 = tensor.insert_slice %inserted_slice_8 into %arg10[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x1x4x?xf32>
scf.yield %inserted_slice_9 : tensor<1x1x4x?xf32>
}
%inserted_slice = tensor.insert_slice %7 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x68x120x96xf32>
scf.yield %inserted_slice : tensor<1x68x120x96xf32>
}
scf.yield %3 : tensor<1x68x120x96xf32>
}
scf.yield %1 : tensor<1x68x120x96xf32>
}
return %0 : tensor<1x68x120x96xf32>
}

/// This checks that the masks (introduced by the vectorizer) are eliminated by
/// the end of the iree-codegen-generic-vectorization pass.

// CHECK-MASK-LABEL: func.func @depthwise_conv_fold_away_masking
// CHECK-MASK-NOT: vector.create_mask
// CHECK-MASK-NOT: vector.constant_mask
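
In this test the dynamic slice sizes are exactly %c4_vscale, matching the
scalable dimension of the vectors the vectorizer chooses (e.g. something like
vector<1x4x[4]xf32> for the [1, 4, [4], 0] tile sizes), so every mask the
vectorizer introduces covers the full vector. Illustratively (not the literal
pass output):

  // An all-true mask of the kind the vectorizer would introduce here; with
  // %c4_vscale = 4 * vscale it spans every lane and is eliminated.
  %mask = vector.create_mask %c1, %c4, %c4_vscale : vector<1x4x[4]xi1>

The CHECK-MASK-NOT lines above assert that no such mask survives the pass.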
