From cfc79eaddf22ac9a69b1557722f905928972fca5 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 19 Jul 2024 14:03:11 +0100 Subject: [PATCH] [Codegen] Support inferring scalable vector sizes (#17891) This patch extends generic vectorization to support inferring scalable vector sizes for linalg ops (using the ScalableValueBoundsConstraintSet). Note: Inferring scalable sizes for tensor.pack/unpack is not supported. --------- Signed-off-by: Benjamin Maxwell --- .../Codegen/Common/GenericVectorization.cpp | 45 +++++++++++++------ .../Common/test/generic_vectorization.mlir | 39 ++++++++++++++++ .../src/iree/compiler/Codegen/Utils/Utils.cpp | 22 +++++++++ .../src/iree/compiler/Codegen/Utils/Utils.h | 11 +++++ 4 files changed, 104 insertions(+), 13 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp index 16a04bb308a2..e36a9c789092 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp @@ -29,6 +29,7 @@ namespace { struct VectorizationTileSizes { SmallVector destShape; SmallVector vectorSizes; + SmallVector vectorScalableFlags; }; /// Returns a VectorizationTileSizes which contains the inferred bounded result @@ -41,13 +42,25 @@ static std::optional inferSizesFromIR(Value val); /// Returns std::nullopt if vector sizes can't be inferred. 
static std::optional inferSizesFromIR(linalg::LinalgOp linalgOp, std::optional opResult) { - LLVM_DEBUG(VEC_DBGS() << "Inferring sizes for:\n" - << linalgOp << " with OpResult.resultNumber=" - << opResult->getResultNumber() << "\n"); + LLVM_DEBUG({ + VEC_DBGS() << "Inferring sizes for:\n" << linalgOp; + if (opResult) { + VEC_DBGS() << " with OpResult.resultNumber=" + << opResult->getResultNumber(); + } + VEC_DBGS() << '\n'; + }); + + std::optional vscaleRange; + if (!opResult) { + // Note: Inferring scalable sizes is not supported if `opResult` is set + // (which is used to compute sizes for tensor.pack/unpack). + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(linalgOp); + vscaleRange = getDefaultVscaleRange(targetAttr); + } VectorizationTileSizes result; unsigned numDims = linalgOp.getNumLoops(); - for (int dim = 0; dim < numDims; ++dim) { // Map dimension `dim` to an operand dimension that we will use to // traverse the U-D chain to get `dim` vector size information. @@ -63,22 +76,21 @@ inferSizesFromIR(linalg::LinalgOp linalgOp, std::optional opResult) { // Trivial case: `dim` size is available in the operand type. int64_t dimSize = llvm::cast(firstOperand.getType()) .getShape()[firstOperandDim]; + bool dimScalable = false; if (!ShapedType::isDynamic(dimSize)) { result.vectorSizes.push_back(dimSize); + result.vectorScalableFlags.push_back(dimScalable); LLVM_DEBUG(VEC_DBGS() << "Inferred iteration size '" << dimSize << "' for dimension '" << dim << "'\n"); continue; } // Use ValueBounds analysis to infer `dim` size upper bound. 
- FailureOr maybeDimBound; + FailureOr maybeDimBound; for (auto operandDimPair : operandDimPairs) { Value operand = operandDimPair.first; unsigned operandDim = operandDimPair.second; - maybeDimBound = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, {operand, operandDim}, - /*stopCondition=*/nullptr, /*closedUB=*/true); - + maybeDimBound = computeDimUpperBound(operand, operandDim, vscaleRange); if (succeeded(maybeDimBound)) { break; } @@ -88,13 +100,19 @@ inferSizesFromIR(linalg::LinalgOp linalgOp, std::optional opResult) { return std::nullopt; } - dimSize = maybeDimBound.value(); + dimSize = maybeDimBound->baseSize; + dimScalable = maybeDimBound->scalable; result.vectorSizes.push_back(dimSize); + result.vectorScalableFlags.push_back(dimScalable); + LLVM_DEBUG(VEC_DBGS() << "Inferred iteration size '" << dimSize + << (dimScalable ? " x vscale" : "") << "' for dimension '" << dim << "'\n"); } if (opResult) { + assert(!llvm::is_contained(result.vectorScalableFlags, true) && + "inferring scalable bounds with `opResult` not supported!"); result.destShape = linalgOp.getIndexingMapMatchingResult(opResult.value()) .compose(result.vectorSizes); } @@ -244,12 +262,14 @@ getVectorSizes(Operation *op, bool useConfiguredVectorSizes) { // Try to infer the vector sizes from the IR. std::optional> vectorSizes; + SmallVector scalableFlags; TypeSwitch(op) .Case([&](linalg::LinalgOp linalgOp) { std::optional result = inferSizesFromIR(linalgOp, /*opResult=*/std::nullopt); if (result) { vectorSizes = result->vectorSizes; + scalableFlags = result->vectorScalableFlags; } }) .Case([&](auto op) { @@ -269,9 +289,8 @@ getVectorSizes(Operation *op, bool useConfiguredVectorSizes) { .Default([&](Operation *) {}); if (vectorSizes) { - // This can't identify scalable flags, so pad them with `false`. 
- return std::make_pair(vectorSizes.value(), - SmallVector(vectorSizes->size(), false)); + scalableFlags.resize(vectorSizes->size(), false); + return std::make_pair(vectorSizes.value(), scalableFlags); } return std::nullopt; } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir index 6943348fd651..3f0947d43f91 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir @@ -366,3 +366,42 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor, %arg1: // CHECK-MASK: %[[GENERIC_SRC:.+]] = vector.transfer_read %[[UNPACK_WRITE]]{{.+}}, %[[GENERIC_MASK]] // CHECK-MASK: %[[EXP:.+]] = math.exp %[[GENERIC_SRC]] // CHECK-MASK: vector.transfer_write %[[EXP]]{{.+}}, %[[GENERIC_MASK]] + +// ----- + +#aarch64_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", target_triple = "aarch64-none-elf"}> +#map = affine_map<()[s0] -> (-(176 mod s0) + 176)> + +func.func @dynamic_fill_with_scalable_tiling_infer_vector_size(%arg0: tensor<1x67x120x176xf32>) -> tensor<1x67x120x176xf32> + attributes {hal.executable.target = #aarch64_sve} +{ + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c67 = arith.constant 67 : index + %c120 = arith.constant 120 : index + %cst = arith.constant 0.000000e+00 : f32 + %vscale = vector.vscale + %c4_vscale = arith.muli %vscale, %c4 : index + %0 = scf.for %arg1 = %c0 to %c67 step %c1 iter_args(%arg2 = %arg0) -> (tensor<1x67x120x176xf32>) { + %1 = scf.for %arg3 = %c0 to %c120 step %c4 iter_args(%arg4 = %arg2) -> (tensor<1x67x120x176xf32>) { + %2 = affine.apply #map()[%c4_vscale] + %3 = scf.for %arg5 = %c0 to %2 step %c4_vscale iter_args(%arg6 = %arg4) -> (tensor<1x67x120x176xf32>) { + %extracted_slice = tensor.extract_slice %arg6[0, %arg1, %arg3, %arg5] [1, 
1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x67x120x176xf32> to tensor<1x1x4x?xf32> + %4 = linalg.fill ins(%cst : f32) outs(%extracted_slice : tensor<1x1x4x?xf32>) -> tensor<1x1x4x?xf32> + %inserted_slice = tensor.insert_slice %4 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x67x120x176xf32> + scf.yield %inserted_slice : tensor<1x67x120x176xf32> + } + scf.yield %3 : tensor<1x67x120x176xf32> + } + scf.yield %1 : tensor<1x67x120x176xf32> + } + return %0 : tensor<1x67x120x176xf32> +} + +// CHECK-MASK-LABEL: func.func @dynamic_fill_with_scalable_tiling_infer_vector_size +// CHECK-MASK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x4x[4]xf32> +// CHECK-MASK: scf.for +// CHECK-MASK: scf.for +// CHECK-MASK: scf.for +// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32> diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp index abeb35d83553..5db070b79c08 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp @@ -1157,4 +1157,26 @@ getDefaultVscaleRange(IREE::HAL::ExecutableTargetAttr targetAttr) { return std::nullopt; } +FailureOr +computeDimUpperBound(Value shapedValue, unsigned dimNum, + std::optional vscaleRange) { + if (!vscaleRange.has_value()) { + FailureOr maybeDimBoundSize = + ValueBoundsConstraintSet::computeConstantBound( + presburger::BoundType::UB, {shapedValue, dimNum}, + /*stopCondition=*/nullptr, /*closedUB=*/true); + if (succeeded(maybeDimBoundSize)) + return DimBoundSize{.baseSize = *maybeDimBoundSize, .scalable = false}; + return failure(); + } + FailureOr maybeDimBound = + vector::ScalableValueBoundsConstraintSet::computeScalableBound( + shapedValue, dimNum, + /*vscaleMin=*/vscaleRange->min, + /*vscaleMax=*/vscaleRange->max, presburger::BoundType::UB); + if 
(succeeded(maybeDimBound)) + return maybeDimBound->getSize(); + return failure(); +} + } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h index 6aa01a3d561c..ae53df006a81 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h @@ -13,6 +13,7 @@ #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h" +#include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" @@ -233,6 +234,16 @@ struct VscaleRange { std::optional getDefaultVscaleRange(IREE::HAL::ExecutableTargetAttr targetAttr); +using DimBound = vector::ConstantOrScalableBound; +using DimBoundSize = DimBound::BoundSize; + +/// Computes the upper bound of `dimNum` dim of the ShapedType value +/// `shapedValue`. If the optional `vscaleRange` is provided then the computed +/// bound can be a scalable quantity. +FailureOr +computeDimUpperBound(Value shapedValue, unsigned dimNum, + std::optional vscaleRange); + } // namespace mlir::iree_compiler #endif // IREE_COMPILER_CODEGEN_UTILS_UTILS_H_