Xilinx · cmcgirr-amd · Aug 13, 2024 · Jul 15, 2024 · Jul 15, 2024 · Jul 15, 2024
diff --git a/lib/Conversion/TorchToTosa/TorchToTosa.cpp b/lib/Conversion/TorchToTosa/TorchToTosa.cpp
@@ -24,6 +24,7 @@
 #include "torch-mlir/Dialect/Torch/Utils/Utils.h"
 #include "torch-mlir/Dialect/TorchConversion/IR/TorchConversionDialect.h"
 #include "torch-mlir/Dialect/TorchConversion/Transforms/BackendTypeConversion.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include <numeric>
 #include <optional>
 
@@ -1132,6 +1133,34 @@ Type getMatMulOutputType(Type inputElemTy, PatternRewriter &rewriter) {
   return outputElemTy;
 }
 
+RankedTensorType getCastedInputTypeForMatmul(Value inputValue,
+                                             PatternRewriter &rewriter) {
+  // Returns the pre-cast type of `inputValue` if the cast producing it can be
+  // folded into the matmul, and a null type otherwise. Values without a
+  // defining op are rejected up front so TypeSwitch never sees a null pointer.
+  Operation *definingOp = inputValue.getDefiningOp();
+  if (!definingOp)
+    return RankedTensorType();
+  // dyn_cast yields a null type when the cast source is not a ranked tensor.
+  auto preCastType =
+      TypeSwitch<Operation *, RankedTensorType>(definingOp)
+          .Case<AtenToDtypeOp, tosa::CastOp>([](auto castOp) {
+            return dyn_cast<RankedTensorType>(castOp->getOperand(0).getType());
+          })
+          .Default([](Operation * /*op*/) { return RankedTensorType(); });
+  if (!preCastType)
+    return preCastType;
+  // The cast can be folded only if the accumulator type expected for the
+  // pre-cast element type matches the cast's output element type: the cast is
+  // then an up-cast that does not alter values by rounding or saturation.
+  auto accumulatorType =
+      getMatMulOutputType(preCastType.getElementType(), rewriter);
+  return accumulatorType ==
+                 cast<RankedTensorType>(inputValue.getType()).getElementType()
+             ? preCastType
+             : RankedTensorType();
+}
+
 // Perform the basic n-dim matmul operation encompassing the handling of
 // broadcasting and dynamic shape propagation.
 // All PyTorch ops that leverage matrix multiplication will derive this and
@@ -1173,6 +1202,28 @@ class ConvertAtenMatmulBaseOp : public OpConversionPattern<AtenOpT> {
       return rewriter.notifyMatchFailure(op,
                                          "Matmul: input datatypes mismatched");
 
+    // Step: check if both inputs have been casted from a supported input type
+    // to an accumulator type and, if so, insert casts back to the original
+    // type. Either helper call may return a null type (input not produced by a
+    // foldable cast), so both results are null-checked before their element
+    // types are compared.
+    RankedTensorType lhsPreCastedType =
+        getCastedInputTypeForMatmul(lhs, rewriter);
+    RankedTensorType rhsPreCastedType =
+        getCastedInputTypeForMatmul(rhs, rewriter);
+    if (lhsPreCastedType && rhsPreCastedType &&
+        (lhsPreCastedType.getElementType() ==
+         rhsPreCastedType.getElementType())) {
+      auto *castTypeConverter =
+          OpConversionPattern<AtenOpT>::getTypeConverter();
+      lhs = rewriter.create<tosa::CastOp>(
+          lhs.getLoc(), castTypeConverter->convertType(lhsPreCastedType), lhs);
+      rhs = rewriter.create<tosa::CastOp>(
+          rhs.getLoc(), castTypeConverter->convertType(rhsPreCastedType), rhs);
+      lhsElemTy = lhsPreCastedType.getElementType();
+      rhsElemTy = rhsPreCastedType.getElementType();
+    }
+
     auto outputElemTy = getMatMulOutputType(lhsElemTy, rewriter);
     if (!outputElemTy) {
       return rewriter.notifyMatchFailure(
@@ -1565,12 +1616,13 @@ class ConvertAtenMatmulBaseOp : public OpConversionPattern<AtenOpT> {
                 matmulLhs, matmulRhs)
             .getResult();
 
+    auto torchOpOutputType = lhsTy.getElementType();
     auto castOutputTy = RankedTensorType::get(
-        makeShapeLLVMCompatible(matmulOutputShape), lhsElemTy);
+        makeShapeLLVMCompatible(matmulOutputShape), torchOpOutputType);
     auto castResult = rewriter.createOrFold<tosa::CastOp>(
         op->getLoc(),
-        OpConversionPattern<AtenOpT>::getTypeConverter()
-            ->convertType(castOutputTy),
+        OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
+            castOutputTy),
         mmOpResult);
 
     // Perform the reshape to output shape. This is always required unless max
@@ -1673,7 +1725,7 @@ class ConvertAtenMatmulBaseOp : public OpConversionPattern<AtenOpT> {
 
       // Perform reshape
       auto reshapedOpType = RankedTensorType::get(
-          makeShapeLLVMCompatible(reshapedOpShape), lhsElemTy);
+          makeShapeLLVMCompatible(reshapedOpShape), outputElemTy);
       auto reshapedOp = rewriter.create<tosa::ReshapeOp>(
           op->getLoc(),
           OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(

diff --git a/test/Conversion/TorchToTosa/basic.mlir b/test/Conversion/TorchToTosa/basic.mlir
@@ -85,6 +85,46 @@ func.func @torch.aten.mm_2d(%arg0 : !torch.vtensor<[2,6],f32>, %arg1 : !torch.vt
 
 // -----
 
+// CHECK: tosa.matmul{{.*}} : (tensor<1x4x8xbf16>, tensor<1x8x16xbf16>) -> tensor<1x4x16xf32>
+func.func @torch.aten.mm_bf16(%arg0: !torch.vtensor<[4,8],bf16>, %arg1: !torch.vtensor<[8,16],bf16>) -> !torch.vtensor<[4,16],f32> {
+  %false = torch.constant.bool false
+  %none = torch.constant.none
+  %int6 = torch.constant.int 6 // dtype argument of the to.dtype ops below, whose results are f32
+  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[4,8],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4,8],f32>
+  %1 = torch.aten.to.dtype %arg1, %int6, %false, %false, %none : !torch.vtensor<[8,16],bf16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[8,16],f32>
+  %2 = torch.aten.mm %0, %1 : !torch.vtensor<[4,8],f32>, !torch.vtensor<[8,16],f32> -> !torch.vtensor<[4,16],f32> // expected lowering: tosa.matmul takes the bf16 operands directly and produces f32
+  return %2 : !torch.vtensor<[4,16],f32>
+}
+
+// -----
+
+// CHECK: tosa.matmul{{.*}} : (tensor<1x4x8xf16>, tensor<1x8x16xf16>) -> tensor<1x4x16xf32>
+func.func @torch.aten.mm_f16(%arg0: !torch.vtensor<[4,8],f16>, %arg1: !torch.vtensor<[8,16],f16>) -> !torch.vtensor<[4,16],f32> {
+  %false = torch.constant.bool false
+  %none = torch.constant.none
+  %int6 = torch.constant.int 6 // dtype argument of the to.dtype ops below, whose results are f32
+  %0 = torch.aten.to.dtype %arg0, %int6, %false, %false, %none : !torch.vtensor<[4,8],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4,8],f32>
+  %1 = torch.aten.to.dtype %arg1, %int6, %false, %false, %none : !torch.vtensor<[8,16],f16>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[8,16],f32>
+  %2 = torch.aten.mm %0, %1 : !torch.vtensor<[4,8],f32>, !torch.vtensor<[8,16],f32> -> !torch.vtensor<[4,16],f32> // expected lowering: tosa.matmul takes the f16 operands directly and produces f32
+  return %2 : !torch.vtensor<[4,16],f32>
+}
+
+
+// -----
+
+// CHECK: tosa.matmul{{.*}} : (tensor<1x4x8xi8>, tensor<1x8x16xi8>) -> tensor<1x4x16xi32>
+func.func @torch.aten.mm_i8(%arg0: !torch.vtensor<[4,8],si8>, %arg1: !torch.vtensor<[8,16],si8>) -> !torch.vtensor<[4,16],si32> {
+  %false = torch.constant.bool false
+  %none = torch.constant.none
+  %int3 = torch.constant.int 3 // dtype argument of the to.dtype ops below, whose results are si32
+  %0 = torch.aten.to.dtype %arg0, %int3, %false, %false, %none : !torch.vtensor<[4,8],si8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[4,8],si32>
+  %1 = torch.aten.to.dtype %arg1, %int3, %false, %false, %none : !torch.vtensor<[8,16],si8>, !torch.int, !torch.bool, !torch.bool, !torch.none -> !torch.vtensor<[8,16],si32>
+  %2 = torch.aten.mm %0, %1 : !torch.vtensor<[4,8],si32>, !torch.vtensor<[8,16],si32> -> !torch.vtensor<[4,16],si32> // expected lowering: tosa.matmul takes the i8 operands directly and produces i32
+  return %2 : !torch.vtensor<[4,16],si32>
+}
+
+// -----
+
 //      CHECK: %[[VAL_2:.+]] = tosa.reshape %0 {new_shape = array<i64: 100, 6, 2>} : (tensor<10x10x6x2xf32>) -> tensor<100x6x2xf32>
 // CHECK-NEXT: %[[VAL_3:.+]] = tosa.reshape %1 {new_shape = array<i64: 100, 2, 6>} : (tensor<10x10x2x6xf32>) -> tensor<100x2x6xf32>
 // CHECK-NEXT: %[[VAL_4:.+]] = tosa.matmul %[[VAL_2]], %[[VAL_3]] : (tensor<100x6x2xf32>, tensor<100x2x6xf32>) -> tensor<100x6x6xf32>