From 80c33b0ab5aae5003ee591eae191d44b9791994f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Mon, 29 Jul 2024 11:25:50 +0100
Subject: [PATCH] [Codegen] Support inferring scalable vector sizes for
 remainder loops (#17998)

Follow-on to #17891 that extends the scalable vector size inference to
work for remainder loops. The upper bound of a scalable remainder dim
is an expression of the form `(vscale * n) + cst` (cst <= 0). This
patch adds some simple pattern matching that rounds up the upper bound
by removing the `+ cst`, which allows matching the scalable quantity.

---------

Signed-off-by: Benjamin Maxwell
---
 .../Codegen/Common/GenericVectorization.cpp   |  3 +-
 .../Common/test/generic_vectorization.mlir    | 40 +++++++++++++++++++
 .../src/iree/compiler/Codegen/Utils/Utils.cpp | 25 ++++++++++--
 .../src/iree/compiler/Codegen/Utils/Utils.h   |  7 +++-
 4 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
index e36a9c789092..6c24885a9c84 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
@@ -90,7 +90,8 @@ inferSizesFromIR(linalg::LinalgOp linalgOp, std::optional<OpResult> opResult) {
   for (auto operandDimPair : operandDimPairs) {
     Value operand = operandDimPair.first;
     unsigned operandDim = operandDimPair.second;
-    maybeDimBound = computeDimUpperBound(operand, operandDim, vscaleRange);
+    maybeDimBound = computeDimUpperBound(operand, operandDim, vscaleRange,
+                                         RoundUpVscaleMultiple::Yes);
     if (succeeded(maybeDimBound)) {
       break;
     }
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
index 3f0947d43f91..2ce61865ef58 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
@@ -405,3 +405,43 @@ func.func @dynamic_fill_with_scalable_tiling_infer_vector_size(%arg0: tensor<1x6
 // CHECK-MASK:   scf.for
 // CHECK-MASK:     scf.for
 // CHECK-MASK:       vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32>
+
+// -----
+
+#aarch64_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", target_triple = "aarch64-none-elf"}>
+
+func.func @dynamic_fill_with_scalable_tiling_infer_remainder_vector_size(%arg0: tensor<1x67x120x100xf32>) -> tensor<1x67x120x100xf32>
+  attributes {hal.executable.target = #aarch64_sve}
+{
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c100 = arith.constant 100 : index
+  %c67 = arith.constant 67 : index
+  %c120 = arith.constant 120 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  %0 = scf.for %arg1 = %c0 to %c67 step %c1 iter_args(%arg2 = %arg0) -> (tensor<1x67x120x100xf32>) {
+    %1 = scf.for %arg3 = %c0 to %c120 step %c4 iter_args(%arg4 = %arg2) -> (tensor<1x67x120x100xf32>) {
+      %rem_start = affine.apply affine_map<()[s0] -> (-(100 mod s0) + 100)>()[%c4_vscale]
+      %3 = scf.for %arg5 = %rem_start to %c100 step %c4_vscale iter_args(%arg6 = %arg4) -> (tensor<1x67x120x100xf32>) {
+        %rem_elts = affine.apply affine_map<(d0) -> (-d0 + 100)>(%arg5)
+        %extracted_slice = tensor.extract_slice %arg6[0, %arg1, %arg3, %arg5] [1, 1, 4, %rem_elts] [1, 1, 1, 1] : tensor<1x67x120x100xf32> to tensor<1x1x4x?xf32>
+        %4 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<1x1x4x?xf32>) -> tensor<1x1x4x?xf32>
+        %inserted_slice = tensor.insert_slice %4 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 4, %rem_elts] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x67x120x100xf32>
+        scf.yield %inserted_slice : tensor<1x67x120x100xf32>
+      }
+      scf.yield %3 : tensor<1x67x120x100xf32>
+    }
+    scf.yield %1 : tensor<1x67x120x100xf32>
+  }
+  return %0 : tensor<1x67x120x100xf32>
+}
+
+// CHECK-MASK-LABEL: func.func @dynamic_fill_with_scalable_tiling_infer_remainder_vector_size
+// CHECK-MASK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x4x[4]xf32>
+// CHECK-MASK: scf.for
+// CHECK-MASK:   scf.for
+// CHECK-MASK:     scf.for
+// CHECK-MASK:       vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
index 755ba1256da0..18ba760f06bb 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
@@ -1159,7 +1159,8 @@ getDefaultVscaleRange(IREE::HAL::ExecutableTargetAttr targetAttr) {
 
 FailureOr<DimBoundSize>
 computeDimUpperBound(Value shapedValue, unsigned dimNum,
-                     std::optional<vector::VscaleRange> vscaleRange) {
+                     std::optional<vector::VscaleRange> vscaleRange,
+                     RoundUpVscaleMultiple roundUp) {
   if (!vscaleRange.has_value()) {
     FailureOr<int64_t> maybeDimBoundSize =
         ValueBoundsConstraintSet::computeConstantBound(
@@ -1175,9 +1176,25 @@ computeDimUpperBound(Value shapedValue, unsigned dimNum,
       shapedValue, dimNum,
       /*vscaleMin=*/vscaleRange->min,
       /*vscaleMax=*/vscaleRange->max, presburger::BoundType::UB);
-  if (succeeded(maybeDimBound))
-    return maybeDimBound->getSize();
-  return failure();
+  if (failed(maybeDimBound))
+    return failure();
+  auto boundSize = maybeDimBound->getSize();
+  if (succeeded(boundSize))
+    return boundSize;
+  if (roundUp == RoundUpVscaleMultiple::No)
+    return failure();
+  // If the upper bound map is of the form `add(subExpr, cst)` (cst <= 0),
+  // round it up to `subExpr` (and try matching the bound again).
+  auto binOp = dyn_cast<AffineBinaryOpExpr>(maybeDimBound->map.getResult(0));
+  if (!binOp || binOp.getKind() != AffineExprKind::Add)
+    return failure();
+  auto cst = dyn_cast<AffineConstantExpr>(binOp.getRHS());
+  if (!cst || cst.getValue() > 0)
+    return failure();
+  DimBound roundedDimBound{AffineMap::get(maybeDimBound->map.getNumDims(),
+                                          maybeDimBound->map.getNumSymbols(),
+                                          binOp.getLHS())};
+  return roundedDimBound.getSize();
 }
 
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
index ae53df006a81..20228bd813db 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
@@ -237,12 +237,17 @@ getDefaultVscaleRange(IREE::HAL::ExecutableTargetAttr targetAttr);
 using DimBound = vector::ConstantOrScalableBound;
 using DimBoundSize = DimBound::BoundSize;
 
+/// Should the scalable upper bound be rounded up to the nearest multiple of
+/// vscale?
+enum class RoundUpVscaleMultiple { No, Yes };
+
 /// Computes the upper bound of `dimNum` dim of the ShapedType value
 /// `shapedValue`. If the optional `vscaleRange` is provided then the computed
 /// bound can be a scalable quantity.
 FailureOr<DimBoundSize>
 computeDimUpperBound(Value shapedValue, unsigned dimNum,
-                     std::optional<vector::VscaleRange> vscaleRange);
+                     std::optional<vector::VscaleRange> vscaleRange,
+                     RoundUpVscaleMultiple = RoundUpVscaleMultiple::No);
 
 } // namespace mlir::iree_compiler
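
Note on the rounding step (for intuition, not part of the patch): in the test above, with vscale = 4 the remainder loop starts at `100 - (100 mod 16) = 96`, and the upper bound derived for `%rem_elts` has the form `vscale * 4 + cst` with `cst <= 0`; dropping the constant leaves `vscale * 4`, which matches the scalable size `[4]`. The match itself can be tried in isolation with public MLIR APIs. Below is a minimal standalone sketch, assuming an MLIR tree to build against; `roundUpBoundMap` and the `main` driver are hypothetical names for illustration only, since the patch performs this match inline in `computeDimUpperBound`.

```cpp
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;

// Strip a trailing non-positive constant from a bound of the form
// `add(subExpr, cst)`, mirroring the rounding step the patch adds to
// computeDimUpperBound(). Returns the map unchanged if it has another form.
static AffineMap roundUpBoundMap(AffineMap map) {
  auto binOp = dyn_cast<AffineBinaryOpExpr>(map.getResult(0));
  if (!binOp || binOp.getKind() != AffineExprKind::Add)
    return map;
  auto cst = dyn_cast<AffineConstantExpr>(binOp.getRHS());
  if (!cst || cst.getValue() > 0)
    return map;
  return AffineMap::get(map.getNumDims(), map.getNumSymbols(), binOp.getLHS());
}

int main() {
  MLIRContext ctx;
  // ()[s0] -> (s0 * 4 - 3): the kind of upper bound a scalable remainder
  // loop produces, with s0 standing in for vscale. MLIR canonicalizes the
  // expression to an Add with the constant on the RHS.
  AffineExpr s0 = getAffineSymbolExpr(0, &ctx);
  AffineMap bound = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/1,
                                   s0 * 4 - 3);
  // Prints ()[s0] -> (s0 * 4): the `+ cst` is dropped, so the scalable
  // quantity (4 x vscale) can now be matched.
  llvm::outs() << roundUpBoundMap(bound) << "\n";
}
```

Rounding up is safe here because the result is only used as an upper bound for masked vectorization: a bound of `4 * vscale` is never smaller than `4 * vscale + cst` with `cst <= 0`, and the mask still limits the actual elements written.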