Skip to content

Commit

Permalink
Revert "[Flow] Convert from tensor.cast to flow.tensor.reshape early …(
Browse files Browse the repository at this point in the history
…iree-org#18256)" (iree-org#18331)

This reverts commit 1c0c5a6.

This is causing an issue with
https://github.com/iree-org/iree/actions/runs/10505447157/job/29119827242#step:5:2269
iree/tests/e2e/regression/check_regression_llvm-cpu_layernorm.mlir

Triaging the bug I see that one of the dispatches is slightly different
but should result to the same numerics but it does not
Here is how the problem dispatch used to be originally
```
        func.func @old_dispatch2(%11 : tensor<128x384xf32>, %12: tensor<128xf32>) -> tensor<128x384xf32> {
        %cst = arith.constant 5.000000e+00 : f32
        %cst_0 = arith.constant 0.000000e+00 : f32
        %13 = tensor.empty() : tensor<128x384xf32>
        %14 = tensor.empty() : tensor<128xf32>
        %15 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<128xf32>) -> tensor<128xf32>
        %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%11 : tensor<128x384xf32>) outs(%15 : tensor<128xf32>) {
        ^bb0(%in: f32, %out: f32):
          %18 = arith.addf %in, %out : f32
          linalg.yield %18 : f32
        } -> tensor<128xf32>
        %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11, %16, %12 : tensor<128x384xf32>, tensor<128xf32>, tensor<128xf32>) outs(%13 : tensor<128x384xf32>) {
        ^bb0(%og_in : f32, %in: f32, %in_1: f32, %out: f32):
          %18 = arith.mulf %in, %in_1 : f32
          %19 = arith.subf %og_in, %18 : f32
          linalg.yield %19 : f32
        } -> tensor<128x384xf32>
        return %17 :  tensor<128x384xf32>
        }
  ```
  with the reverting PR this dispatch is becoming
  ```
          func.func @new_dispatch2(%11 : tensor<128x384xf32>, %12: tensor<128xf32>) -> tensor<128x384xf32> {
        %cst = arith.constant 5.000000e+00 : f32
        %cst_0 = arith.constant 0.000000e+00 : f32
        %13 = tensor.empty() : tensor<128x384xf32>
        %14 = tensor.empty() : tensor<128xf32>
        %15 = linalg.fill ins(%cst_0 : f32) outs(%14 : tensor<128xf32>) -> tensor<128xf32>
        %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%11 : tensor<128x384xf32>) outs(%15 : tensor<128xf32>) {
        ^bb0(%in: f32, %out: f32):
          %18 = arith.addf %in, %out : f32
          linalg.yield %18 : f32
        } -> tensor<128xf32>
        %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16, %12 : tensor<128xf32>, tensor<128xf32>) outs(%13 : tensor<128x384xf32>) {
        ^bb0(%in: f32, %in_1: f32, %out: f32):
          %18 = arith.mulf %in, %in_1 : f32
          %19 = arith.subf %cst, %18 : f32
          linalg.yield %19 : f32
        } -> tensor<128x384xf32>
        return %17 :  tensor<128x384xf32>
        }
   ```
   The difference is at `%19 = arith.subf %cst, %18 : f32`
   Note that in both cases `%11 : tensor<128x384xf32>` is `5.0` from the model input and hence the output *should* be same, however on `arch64` it is not but on x86 it is, the IR at the mlir llvm dialect is identicalbetween x86 and arch64 so its not some easy to  spot intrinsics / fast math kind of bug AFAICS
  • Loading branch information
nirvedhmeshram authored Aug 22, 2024
1 parent 588732c commit 8da4564
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -326,9 +326,4 @@ void populateTensorToFlowConversionPatterns(MLIRContext *context,
ConvertTensorReshapePattern<tensor::ExpandShapeOp>>(context);
}

void populateTensorDialectCastOpPattern(MLIRContext *context,
RewritePatternSet &patterns) {
patterns.insert<ConvertTensorCastPattern>(context);
}

} // namespace mlir::iree_compiler::IREE::Flow
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@ namespace mlir::iree_compiler::IREE::Flow {
void populateTensorToFlowConversionPatterns(MLIRContext *context,
RewritePatternSet &patterns);

// Add pattern to convert tensor.cast -> flow.tensor.reshape.
void populateTensorDialectCastOpPattern(MLIRContext *context,
RewritePatternSet &patterns);

} // namespace mlir::iree_compiler::IREE::Flow

#endif // IREE_COMPILER_DIALECT_FLOW_CONVERSION_TENSORTOFLOW_PATTERNS_H_
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Dialect/Flow/Conversion/TensorToFlow/Patterns.h"
#include "iree/compiler/Dialect/Flow/Transforms/Passes.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
Expand Down Expand Up @@ -104,11 +103,6 @@ struct CanonicalizerPass
CanonicalizerPass>::CanonicalizerPassBase;
/// Initialize the canonicalizer by building the set of patterns used during
/// execution.

void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<IREE::Flow::FlowDialect>();
}

LogicalResult initialize(MLIRContext *context) override {
// Inherit the same config defaults from the upstream canonicalizer pass.
config.useTopDownTraversal = true;
Expand All @@ -123,7 +117,6 @@ struct CanonicalizerPass
// Pull in some borderline/downstream canonicalizations for the Flow
// compilation phase.
tensor::populateMergeConsecutiveInsertExtractSlicePatterns(owningPatterns);
IREE::Flow::populateTensorDialectCastOpPattern(context, owningPatterns);
owningPatterns.add<FoldConsecutiveConstantPadding>(context);

patterns =
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(util.func(iree-flow-canonicalize, iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-convert-dispatch-regions-to-workgroups, iree-flow-convert-tensor-to-flow, canonicalize, iree-flow-materialize-default-workgroup-count-region), cse, iree-flow-canonicalize, cse)" %s | FileCheck %s
// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(util.func(iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-convert-dispatch-regions-to-workgroups, iree-flow-convert-tensor-to-flow, canonicalize, iree-flow-materialize-default-workgroup-count-region), cse, iree-flow-canonicalize, cse)" %s | FileCheck %s
util.func public @tile_matmul_alone(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
%1 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
Expand Down Expand Up @@ -231,17 +231,17 @@ util.func public @always_fuse_cast
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: tensor<4x?xf32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]]
// CHECK: %[[RESHAPE:.*]] = flow.tensor.reshape %[[ARG0]]
// CHECK-SAME: tensor<?x?xf32>{%[[M]], %[[C4]]} -> tensor<?x4xf32>{%[[M]]}
// CHECK-DAG: %[[N1:.+]] = tensor.dim %[[ARG1]], %[[C1]]
// CHECK: %[[RESULT1:.+]] = flow.dispatch.workgroups[%[[N1]], %[[M]]]
// CHECK-SAME: (%[[RESHAPE]], %[[ARG1]], %[[N1]], %[[M]])
// CHECK-DAG: %[[K:.+]] = tensor.dim %[[ARG0]], %[[C1]]
// CHECK: %[[RESULT1:.+]] = flow.dispatch.workgroups[%[[M]], %[[K]], %[[N1]]]
// CHECK-SAME: (%[[ARG0]], %[[ARG1]], %[[M]], %[[K]], %[[N1]])
// CHECK: tensor.cast
// CHECK: flow.return
// CHECK-DAG: %[[N2:.+]] = tensor.dim %[[ARG2]], %[[C1]]
// CHECK: %[[RESULT2:.+]] = flow.dispatch.workgroups[%[[N2]], %[[M]]]
// CHECK-SAME: (%[[RESHAPE]], %[[ARG2]], %[[N2]], %[[M]])
// CHECK: %[[RESULT2:.+]] = flow.dispatch.workgroups[%[[M]], %[[K]], %[[N2]]]
// CHECK-SAME: (%[[ARG0]], %[[ARG2]], %[[M]], %[[K]], %[[N2]])
// CHECK: tensor.cast
// CHECK: flow.return
// CHECK: util.return %[[RESULT1]], %[[RESULT2]]

Expand Down Expand Up @@ -512,21 +512,26 @@ util.func public @inline_dag_1(
// CHECK-NOT: linalg.
// CHECK-NOT: tensor.extract_slice
// CHECK: flow.dispatch.workgroups
// CHECK-NEXT: %[[ARG4:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>
// CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<i32>>
// CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>
// CHECK-NEXT: %[[ARG4:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>
// CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>
// CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<i32>>
// CHECK-SAME: %[[ARG7:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG8:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG9:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG10:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<writeonly:tensor<1x?xf32>>
// CHECK-DAG: %[[LEAF1:.+]] = flow.dispatch.tensor.load %[[ARG4]], offsets = [0, 0]
// CHECK-DAG: %[[LEAF2:.+]] = flow.dispatch.tensor.load %[[ARG4]], offsets = [0, 10]
// CHECK-DAG: %[[LEAF3:.+]] = flow.dispatch.tensor.load %[[ARG4]], offsets = [0, 20]
// CHECK-DAG: %[[LEAF4:.+]] = flow.dispatch.tensor.load %[[ARG5]]
// CHECK-DAG: %[[LEAF5:.+]] = flow.dispatch.tensor.load %[[ARG6]]
// CHECK-DAG: %[[INIT:.+]] = tensor.empty
// CHECK-SAME: %[[ARG10:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG11:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG12:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<writeonly:tensor<1x?xf32>>
// CHECK: %[[LEAF1:.+]] = flow.dispatch.tensor.load %[[ARG4]]
// CHECK: %[[LEAF2:.+]] = flow.dispatch.tensor.load %[[ARG5]]
// CHECK: %[[LEAF3:.+]] = flow.dispatch.tensor.load %[[ARG6]]
// CHECK: %[[INIT:.+]] = tensor.empty
// CHECK-DAG: %[[OP1:.+]] = tensor.cast %[[LEAF1]]
// CHECK-DAG: %[[OP2:.+]] = tensor.cast %[[LEAF2]]
// CHECK-DAG: %[[OP3:.+]] = tensor.extract_slice %[[OP1]][0, 0]
// CHECK-DAG: %[[OP4:.+]] = tensor.extract_slice %[[OP1]][0, 10]
// CHECK-DAG: %[[OP5:.+]] = tensor.extract_slice %[[OP1]][0, 20]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LEAF4]], %[[LEAF3]], %[[LEAF5]], %[[LEAF2]], %[[LEAF1]] :
// CHECK-SAME: ins(%[[LEAF3]], %[[OP5]], %[[OP2]], %[[OP4]], %[[OP3]] :
// CHECK-SAME: outs(%[[INIT]] :

// -----
Expand Down Expand Up @@ -567,21 +572,24 @@ util.func public @inline_dag_2(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<1x?xf32>
// CHECK: flow.dispatch.workgroups
// CHECK-NEXT: %[[ARG4:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>
// CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<i32>>
// CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>
// CHECK-NEXT: %[[ARG4:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>
// CHECK-SAME: %[[ARG5:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>
// CHECK-SAME: %[[ARG6:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readonly:tensor<i32>>
// CHECK-SAME: %[[ARG7:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG8:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG9:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG10:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<writeonly:tensor<1x?xf32>>
// CHECK-DAG: %[[LEAF1:.+]] = flow.dispatch.tensor.load %[[ARG4]], offsets = [0, 0]
// CHECK-DAG: %[[LEAF2:.+]] = flow.dispatch.tensor.load %[[ARG4]], offsets = [0, 10]
// CHECK-DAG: %[[LEAF3:.+]] = flow.dispatch.tensor.load %[[ARG4]], offsets = [0, 20]
// CHECK-DAG: %[[LEAF4:.+]] = flow.dispatch.tensor.load %[[ARG5]], {{.*}}
// CHECK-DAG: %[[LEAF5:.+]] = flow.dispatch.tensor.load %[[ARG6]], {{.*}}
// CHECK-DAG: %[[INIT:.+]] = tensor.empty
// CHECK-SAME: %[[ARG10:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG11:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<writeonly:tensor<1x?xf32>>
// CHECK: %[[LEAF1:.+]] = flow.dispatch.tensor.load %[[ARG4]], {{.*}}
// CHECK: %[[LEAF2:.+]] = flow.dispatch.tensor.load %[[ARG5]], {{.*}}
// CHECK: %[[LEAF3:.+]] = flow.dispatch.tensor.load %[[ARG6]], {{.*}}
// CHECK: %[[INIT:.+]] = tensor.empty
// CHECK-DAG: %[[OP1:.+]] = tensor.cast %[[LEAF1]]
// CHECK-DAG: %[[OP3:.+]] = tensor.extract_slice %[[OP1]][0, 0]
// CHECK-DAG: %[[OP4:.+]] = tensor.extract_slice %[[OP1]][0, 10]
// CHECK-DAG: %[[OP5:.+]] = tensor.extract_slice %[[OP1]][0, 20]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LEAF4]], %[[LEAF3]], %[[LEAF5]], %[[LEAF2]], %[[LEAF1]] :
// CHECK-SAME: ins(%[[LEAF3]], %[[OP5]], %[[LEAF2]], %[[OP4]], %[[OP3]] :
// CHECK-SAME: outs(%[[INIT]] :

// -----
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,27 +82,3 @@ util.func public @dont_merge_constant_padding_different_vals(
// CHECK-LABEL: util.func public @dont_merge_constant_padding_different_vals
// CHECK: tensor.pad
// CHECK: tensor.pad

// -----

util.func public @tensor_cast_to_reshape(%reshape_17 : tensor<?x?x?x?xf32>, %65 : tensor<?x12x?x64xf32>, %0 : index, %1 : index) -> tensor<?x?x?x?xf32> {
%cast = tensor.cast %reshape_17 : tensor<?x?x?x?xf32> to tensor<?x?x12x64xf32>
%66 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>],
iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
ins(%cast : tensor<?x?x12x64xf32>) outs(%65 : tensor<?x12x?x64xf32>) {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
} -> tensor<?x12x?x64xf32>
%cast_18 = tensor.cast %66 : tensor<?x12x?x64xf32> to tensor<?x?x?x?xf32>
util.return %cast_18 : tensor<?x?x?x?xf32>
}

// CHECK-LABEL: util.func public @tensor_cast_to_reshape
// CHECK: flow.tensor.reshape
// CHECK-SAME: tensor<?x?x?x?xf32>
// CHECK-SAME: -> tensor<?x?x12x64xf32>
// CHECK: linalg.generic
// CHECK: flow.tensor.reshape
// CHECK-SAME: tensor<?x12x?x64xf32>
// CHECK-SAME: -> tensor<?x?x?x?xf32>

0 comments on commit 8da4564

Please sign in to comment.